diff --git a/Cargo.lock b/Cargo.lock index 6f053a17..17611878 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,91 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "ab_glyph" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5110f1c78cf582855d895ecd0746b653db010cec6d9f5575293f27934d980a39" +dependencies = [ + "ab_glyph_rasterizer", + "owned_ttf_parser", +] + +[[package]] +name = "ab_glyph_rasterizer" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c71b1793ee61086797f5c80b6efa2b8ffa6d5dd703f118545808a7f2e27f7046" + +[[package]] +name = "accesskit" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76eb1adf08c5bcaa8490b9851fd53cca27fa9880076f178ea9d29f05196728a8" + +[[package]] +name = "accesskit_consumer" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04bb4d9e4772fe0d47df57d0d5dbe5d85dd05e2f37ae1ddb6b105e76be58fb00" +dependencies = [ + "accesskit", +] + +[[package]] +name = "accesskit_macos" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "134d0acf6acb667c89d3332999b1a5df4edbc8d6113910f392ebb73f2b03bb56" +dependencies = [ + "accesskit", + "accesskit_consumer", + "objc2", + "once_cell", +] + +[[package]] +name = "accesskit_unix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e084cb5168790c0c112626175412dc5ad127083441a8248ae49ddf6725519e83" +dependencies = [ + "accesskit", + "accesskit_consumer", + "async-channel", + "atspi", + "futures-lite", + "serde", + "zbus", +] + +[[package]] +name = "accesskit_windows" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eac0a7f2d7cd7a93b938af401d3d8e8b7094217989a7c25c55a953023436e31" +dependencies = [ + "accesskit", + "accesskit_consumer", + "arrayvec", + "once_cell", + "paste", + "windows", +] + +[[package]] +name = "accesskit_winit" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "825d23acee1bd6d25cbaa3ca6ed6e73faf24122a774ec33d52c5c86c6ab423c0" +dependencies = [ + "accesskit", + "accesskit_macos", + "accesskit_unix", + "accesskit_windows", + "winit", +] + [[package]] name = "addr2line" version = "0.21.0" @@ -28,6 +113,17 @@ dependencies = [ "cpufeatures", ] +[[package]] +name = "ahash" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", +] + [[package]] name = "aho-corasick" version = "0.7.20" @@ -46,6 +142,30 @@ dependencies = [ "memchr", ] +[[package]] +name = "android-activity" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64529721f27c2314ced0890ce45e469574a73e5e6fdd6e9da1860eb29285f5e0" +dependencies = [ + "android-properties", + "bitflags 1.3.2", + "cc", + "jni-sys", + "libc", + "log", + "ndk", + "ndk-context", + "ndk-sys", + "num_enum 0.6.1", +] + +[[package]] +name = "android-properties" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7eb209b1518d6bb87b283c20095f5228ecda460da70b44f0802523dea6da04" + [[package]] name = "anstream" version = "0.3.2" @@ -101,6 +221,197 @@ version = "1.0.71" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +[[package]] +name = "arboard" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6041616acea41d67c4a984709ddab1587fd0b10efe5cc563fee954d2f011854" +dependencies = [ + "clipboard-win", + "log", + "objc", + "objc-foundation", + "objc_id", + "once_cell", + "parking_lot", + "thiserror", + "winapi", + "x11rb", +] + +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + +[[package]] +name = "async-broadcast" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c48ccdbf6ca6b121e0f586cbc0e73ae440e56c67c30fa0873b4e110d9c26d2b" +dependencies = [ + "event-listener", + "futures-core", +] + +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener", + "futures-core", +] + +[[package]] +name = "async-executor" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fa3dc5f2a8564f07759c008b9109dc0d39de92a88d5588b8a5036d286383afb" +dependencies = [ + "async-lock", + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "slab", +] + +[[package]] +name = "async-fs" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "279cf904654eeebfa37ac9bb1598880884924aab82e290aa65c9e77a0e142e06" +dependencies = [ + "async-lock", + "autocfg", + "blocking", + "futures-lite", +] + +[[package]] +name = "async-io" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" +dependencies = [ + "async-lock", + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-lite", + "log", + "parking", + "polling", + "rustix 0.37.27", + "slab", + "socket2", + "waker-fn", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-process" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a9d28b1d97e08915212e2e45310d47854eafa69600756fc735fb788f75199c9" +dependencies = [ + "async-io", + "async-lock", + "autocfg", + "blocking", + "cfg-if", + "event-listener", + "futures-lite", + "rustix 0.37.27", + "signal-hook", + "windows-sys 0.48.0", +] + +[[package]] +name = "async-recursion" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.29", +] + +[[package]] +name = "async-task" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc7ab41815b3c653ccd2978ec3255c81349336702dfdf62ee6f7069b12a3aae" + +[[package]] +name 
= "async-trait" +version = "0.1.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.29", +] + +[[package]] +name = "atomic-waker" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1181e1e0d1fce796a03db1ae795d67167da795f9cf4a39c37589e85ef57f26d3" + +[[package]] +name = "atomic_refcell" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112ef6b3f6cb3cb6fc5b6b494ef7a848492cff1ab0ef4de10b0f7d572861c905" + +[[package]] +name = "atspi" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "674e7a3376837b2e7d12d34d58ac47073c491dc3bf6f71a7adaf687d4d817faa" +dependencies = [ + "async-recursion", + "async-trait", + "atspi-macros", + "enumflags2", + "futures-lite", + "serde", + "tracing", + "zbus", + "zbus_names", +] + +[[package]] +name = "atspi-macros" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb4870a32c0eaa17e35bca0e6b16020635157121fb7d45593d242c295bc768" +dependencies = [ + "quote", + "syn 1.0.109", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -168,7 +479,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.22", + "syn 2.0.29", "which", ] @@ -184,6 +495,12 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +[[package]] +name = "block" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a" + [[package]] name = "block-buffer" version = "0.10.4" @@ -193,6 +510,40 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-sys" +version = "0.1.0-beta.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa55741ee90902547802152aaf3f8e5248aab7e21468089560d4c8840561146" +dependencies = [ + "objc-sys", +] + +[[package]] +name = "block2" +version = "0.2.0-alpha.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8dd9e63c1744f755c2f60332b88de39d341e5e86239014ad839bd71c106dec42" +dependencies = [ + "block-sys", + "objc2-encode", +] + +[[package]] +name = "blocking" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77231a1c8f801696fc0123ec6150ce92cffb8e164a02afb9c8ddee0e9b65ad65" +dependencies = [ + "async-channel", + "async-lock", + "async-task", + "atomic-waker", + "fastrand", + "futures-lite", + "log", +] + [[package]] name = "bumpalo" version = "3.13.0" @@ -204,6 +555,20 @@ name = "bytemuck" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdde5c9cd29ebd706ce1b35600920a33550e402fc998a2e53ad3b42c3c47a192" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.29", +] [[package]] name = "byteorder" @@ -266,6 +631,20 @@ dependencies = [ "zip", ] +[[package]] +name = "calloop" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"52e0d00eb1ea24371a97d2da6201c6747a633dc6dc1988ef503403b4c59504a8" +dependencies = [ + "bitflags 1.3.2", + "log", + "nix 0.25.1", + "slotmap", + "thiserror", + "vec_map", +] + [[package]] name = "cc" version = "1.0.79" @@ -275,6 +654,12 @@ dependencies = [ "jobserver", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + [[package]] name = "cexpr" version = "0.6.0" @@ -290,6 +675,21 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" + +[[package]] +name = "cgl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ced0551234e87afee12411d535648dd89d2e7f34c78b753395567aff3d447ff" +dependencies = [ + "libc", +] + [[package]] name = "ci_info" version = "0.10.2" @@ -352,7 +752,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", ] [[package]] @@ -372,6 +772,37 @@ dependencies = [ "winapi", ] +[[package]] +name = "cocoa" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f425db7937052c684daec3bd6375c8abe2d146dca4b8b143d6db777c39138f3a" +dependencies = [ + "bitflags 1.3.2", + "block", + "cocoa-foundation", + "core-foundation", + "core-graphics", + "foreign-types", + "libc", + "objc", +] + +[[package]] +name = "cocoa-foundation" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "931d3837c286f56e3c58423ce4eba12d08db2374461a785c86f672b08b5650d6" +dependencies = [ + "bitflags 1.3.2", + "block", + "core-foundation", + "core-graphics-types", + "foreign-types", + "libc", + "objc", +] + [[package]] name = "color-eyre" version = "0.6.2" @@ -385,6 +816,12 @@ dependencies = [ "owo-colors", ] +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + [[package]] name = "colorchoice" version = "1.0.0" @@ -392,14 +829,33 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] -name = "colored" -version = "2.0.4" +name = "colored" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" +dependencies = [ + "is-terminal", + "lazy_static", + "windows-sys 0.48.0", +] + +[[package]] +name = "combine" +version = "4.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "concurrent-queue" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" +checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" dependencies = [ - "is-terminal", - "lazy_static", - "windows-sys 0.48.0", + "crossbeam-utils", ] [[package]] @@ -436,6 +892,30 @@ version = "0.8.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "core-graphics" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2581bbab3b8ffc6fcbd550bf46c355135d16e9ff2a6ea032ad6b9bf1d7efe4fb" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-graphics-types", + "foreign-types", + "libc", +] + +[[package]] +name = "core-graphics-types" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bb142d41022986c1d8ff29103a1411c8a3dfad3552f87a4f8dc50d61d4f4e33" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "libc", +] + [[package]] name = "cpufeatures" version = "0.2.8" @@ -484,7 +964,7 @@ dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset", + "memoffset 0.9.0", "scopeguard", ] @@ -548,6 +1028,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "derive_builder" version = "0.12.0" @@ -631,12 +1122,134 @@ dependencies = [ "winapi", ] +[[package]] +name = "dispatch" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd0c93bb4b0c6d9b77f4435b0ae98c24d17f1c45b2ff844c6151a07256ca923b" + +[[package]] +name = "dlib" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "330c60081dcc4c72131f8eb70510f1ac07223e5d4163db481a04a0befcffa412" +dependencies = [ + "libloading", +] + +[[package]] +name = "downcast-rs" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea835d29036a4087793836fa931b08837ad5e957da9e23886b29586fb9b6650" + +[[package]] +name = "ecolor" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e479a7fa3f23d4e794f8b2f8b3568dd4e47886ad1b12c9c095e141cb591eb63" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "eframe" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4596583a2c680c55b6feaa748f74890c4f9cb9c7cb69d6117110444cb65b2f" +dependencies = [ + "bytemuck", + "cocoa", + "egui", + "egui-winit", + "egui_glow", + "glow", + "glutin", + "glutin-winit", + "image", + "js-sys", + "log", + "objc", + "percent-encoding", + "raw-window-handle", + "thiserror", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winapi", + "winit", +] + +[[package]] +name = "egui" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3aef8ec3ae1b772f340170c65bf27d5b8c28f543a0116c844d2ac08d01123e7" +dependencies = [ + "accesskit", + "ahash", + "epaint", + "log", + "nohash-hasher", +] + +[[package]] +name = "egui-winit" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a49155fd4a0a4fb21224407a91de0030847972ef90fc64edb63621caea61cb2" +dependencies = [ + "accesskit_winit", + "arboard", + "egui", + "instant", + "log", + "raw-window-handle", + "smithay-clipboard", + "webbrowser", + "winit", +] + +[[package]] +name = "egui_extras" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9278f4337b526f0d57e5375e5a7340a311fa6ee8f9fcc75721ac50af13face02" +dependencies = [ + "egui", + "serde", +] + +[[package]] +name = "egui_glow" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f8c2752cdf1b0ef5fcda59a898cacabad974d4f5880e92a420b2c917022da64" +dependencies = [ + "bytemuck", + "egui", + "glow", + "log", + "memoffset 0.6.5", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "either" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +[[package]] +name = "emath" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3857d743a6e0741cdd60b622a74c7a36ea75f5f8f11b793b41d905d2c9721a4b" +dependencies = [ + "bytemuck", +] + [[package]] name = "encode_unicode" version = "0.3.6" @@ -658,6 +1271,27 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" +[[package]] +name = "enumflags2" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c041f5090df68b32bcd905365fd51769c8b9d553fe87fde0b683534f10c01bd2" +dependencies = [ + "enumflags2_derive", + "serde", +] + +[[package]] +name = "enumflags2_derive" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e9a1f9f7d83e59740248a6e14ecf93929ade55027844dfcea78beafccc15745" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.29", +] + [[package]] name = "env_logger" version = "0.10.0" @@ -678,9 +1312,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2d328fc287c61314c4a61af7cfdcbd7e678e39778488c7cb13ec133ce0f4059" dependencies = [ "fsio", - "indexmap", + "indexmap 1.9.3", +] + +[[package]] +name = "epaint" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09333964d4d57f40a85338ba3ca5ed4716070ab184dcfed966b35491c5c64f3b" +dependencies = [ + "ab_glyph", + "ahash", + "atomic_refcell", + "bytemuck", + "ecolor", + "emath", + "log", + "nohash-hasher", + "parking_lot", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.1" @@ -718,6 +1375,12 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f748b253ceca9fed5f42f8b5ceb3851e93102199bc25b64b65369f76e5c0a35" +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "eyre" version = "0.6.8" @@ -748,6 +1411,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "fdeflate" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d329bdeac514ee06249dabc27877490f17f5d371ec693360768b838e19f3ae10" +dependencies = [ + "simd-adler32", +] + [[package]] name = "filetime" version = "0.2.21" @@ -837,6 +1509,21 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-sink" version = "0.3.28" @@ -857,6 +1544,7 @@ checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-core", "futures-io", + "futures-sink", "futures-task", "memchr", "pin-project-lite", @@ -881,6 +1569,16 @@ dependencies = [ "version_check", ] +[[package]] +name = "gethostname" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1ebd34e35c46e00bb73e81363248d627782724609fe1b6396f553f68fe3862e" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "getopts" version = "0.2.21" @@ -907,6 +1605,7 @@ version = "0.2.0-dev" dependencies = [ "anyhow", "ggml-sys", + "indexmap 2.0.2", "memmap2", "rand", "thiserror", @@ -919,18 +1618,115 @@ dependencies = [ "cc", ] +[[package]] +name = "gguf-explorer" +version = "0.1.0" +dependencies = [ + "anyhow", + "eframe", + "egui_extras", + "ggml", +] + [[package]] name = "gimli" version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +[[package]] +name = "gl_generator" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a95dfc23a2b4a9a2f5ab41d194f8bfda3cabec42af4e39f08c339eb2a0c124d" +dependencies = [ + "khronos_api", + "log", + "xml-rs", +] + [[package]] name = "glob" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "glow" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca0fe580e4b60a8ab24a868bc08e2f03cbcb20d3d676601fa909386713333728" +dependencies = [ + "js-sys", + "slotmap", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "glutin" +version = "0.30.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc93b03242719b8ad39fb26ed2b01737144ce7bd4bfc7adadcef806596760fe" +dependencies = [ + "bitflags 1.3.2", + "cfg_aliases", + "cgl", + "core-foundation", + "dispatch", + "glutin_egl_sys", + "glutin_glx_sys", + "glutin_wgl_sys", + "libloading", + "objc2", + "once_cell", + "raw-window-handle", + "wayland-sys 0.30.1", + "windows-sys 0.45.0", + "x11-dl", +] + +[[package]] +name = "glutin-winit" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "629a873fc04062830bfe8f97c03773bcd7b371e23bcc465d0a61448cd1588fa4" +dependencies = [ + "cfg_aliases", + "glutin", + "raw-window-handle", + "winit", +] + +[[package]] +name = "glutin_egl_sys" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af784eb26c5a68ec85391268e074f0aa618c096eadb5d6330b0911cf34fe57c5" +dependencies = [ + "gl_generator", + "windows-sys 0.45.0", +] + +[[package]] +name = "glutin_glx_sys" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b53cb5fe568964aa066a3ba91eac5ecbac869fb0842cd0dc9e412434f1a1494" +dependencies = [ + "gl_generator", + "x11-dl", +] + +[[package]] +name = "glutin_wgl_sys" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ef89398e90033fc6bc65e9bd42fd29bbbfd483bda5b56dc5562f455550618165" +dependencies = [ + "gl_generator", +] + [[package]] name = "h2" version = "0.3.20" @@ -943,7 +1739,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap", + "indexmap 1.9.3", "slab", "tokio", "tokio-util", @@ -965,6 +1761,12 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156" + [[package]] name = "heck" version = "0.4.1" @@ -977,6 +1779,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "hmac" version = "0.12.1" @@ -986,6 +1794,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" +dependencies = [ + "windows-sys 0.48.0", +] + [[package]] name = "http" version = "0.2.9" @@ -1079,6 +1896,20 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "image" +version = "0.24.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f3dfdbdd72063086ff443e297b61695500514b1e41095b6fb9a5ab48a70a711" +dependencies = [ + "bytemuck", + "byteorder", + "color_quant", + "num-rational", + "num-traits", + "png", +] + [[package]] name = "indenter" version = "0.3.3" @@ -1092,7 +1923,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897" +dependencies = [ + "equivalent", + "hashbrown 0.14.2", ] [[package]] @@ -1123,6 +1964,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", ] [[package]] @@ -1177,6 +2021,28 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +[[package]] +name = "jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", + "cfg-if", + "combine", + "jni-sys", + "log", + "thiserror", + "walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + [[package]] name = "jobserver" version = "0.1.26" @@ -1195,6 +2061,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "khronos_api" +version = "3.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2db585e1d738fc771bf08a151420d3ed193d9d895a36df7f6f8a9456b911ddc" + [[package]] name = "lazy_static" version = "1.4.0" @@ -1256,6 +2128,7 @@ dependencies = [ "serde", "serde_json", "spinoff", + "thiserror", "tracing", ] @@ -1266,6 +2139,7 @@ dependencies = [ "bytemuck", "ggml", "half", + "indexmap 2.0.2", "llm-samplers", "memmap2", "partial_sort", @@ -1425,6 +2299,15 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d" +[[package]] +name = "malloc_buf" +version = "0.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62bb907fe88d54d8d9ce32a3cceab4218ed2f6b7d35617cafe9adf84e43919cb" +dependencies = [ + "libc", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1449,6 +2332,24 @@ dependencies = [ "libc", ] +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "memoffset" version = "0.9.0" @@ -1477,6 +2378,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" dependencies = [ "adler", + "simd-adler32", ] [[package]] @@ -1486,6 +2388,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", + "log", "wasi", "windows-sys 0.48.0", ] @@ -1508,7 +2411,7 @@ checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", ] [[package]] @@ -1529,6 +2432,35 @@ dependencies = [ "tempfile", ] +[[package]] +name = "ndk" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "451422b7e4718271c8b5b3aadf5adedba43dc76312454b387e98fae0fc951aa0" +dependencies = [ + "bitflags 1.3.2", + "jni-sys", + "ndk-sys", + "num_enum 0.5.11", + "raw-window-handle", + "thiserror", +] + +[[package]] +name = "ndk-context" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27b02d87554356db9e9a873add8782d4ea6e3e58ea071a9adb9a2e8ddb884a8b" + +[[package]] +name = "ndk-sys" +version = "0.4.1+23.1.7779620" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cf2aae958bd232cac5069850591667ad422d263686d75b52a065f9badeee5a3" +dependencies = [ + "jni-sys", +] + [[package]] name = "nias" version = "0.5.0" @@ -1544,6 +2476,31 @@ dependencies = [ "smallvec", ] +[[package]] +name = "nix" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa52e972a9a719cecb6864fb88568781eb706bac2cd1d4f04a648542dbf78069" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", + "memoffset 0.6.5", +] + +[[package]] +name = "nix" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4" +dependencies = [ + "autocfg", + "bitflags 1.3.2", + "cfg-if", + "libc", + 
"memoffset 0.6.5", +] + [[package]] name = "nix" version = "0.26.2" @@ -1553,9 +2510,16 @@ dependencies = [ "bitflags 1.3.2", "cfg-if", "libc", + "memoffset 0.7.1", "static_assertions", ] +[[package]] +name = "nohash-hasher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" + [[package]] name = "nom" version = "7.1.3" @@ -1576,6 +2540,27 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.15" @@ -1595,12 +2580,109 @@ dependencies = [ "libc", ] +[[package]] +name = "num_enum" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f646caf906c20226733ed5b1374287eb97e3c2a5c227ce668c1f2ce20ae57c9" +dependencies = [ + "num_enum_derive 0.5.11", +] + +[[package]] +name = "num_enum" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a015b430d3c108a207fd776d2e2196aaf8b1cf8cf93253e3a097ff3085076a1" +dependencies = [ + "num_enum_derive 0.6.1", +] + +[[package]] +name = "num_enum_derive" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "num_enum_derive" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96667db765a921f7b295ffee8b60472b686a51d4f21c2ee4ffdb94c7013b65a6" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.29", +] + [[package]] name = "number_prefix" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" +[[package]] +name = "objc" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1" +dependencies = [ + "malloc_buf", +] + +[[package]] +name = "objc-foundation" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1add1b659e36c9607c7aab864a76c7a4c2760cd0cd2e120f3fb8b952c7e22bf9" +dependencies = [ + "block", + "objc", + "objc_id", +] + +[[package]] +name = "objc-sys" +version = "0.2.0-beta.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b9834c1e95694a05a828b59f55fa2afec6288359cda67146126b3f90a55d7" + +[[package]] +name = "objc2" +version = "0.3.0-beta.3.patch-leaks.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e01640f9f2cb1220bbe80325e179e532cb3379ebcd1bf2279d703c19fe3a468" +dependencies = [ + "block2", + "objc-sys", + "objc2-encode", +] + +[[package]] +name = "objc2-encode" +version = "2.0.0-pre.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"abfcac41015b00a120608fdaa6938c44cb983fee294351cc4bac7638b4e50512" +dependencies = [ + "objc-sys", +] + +[[package]] +name = "objc_id" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c92d4ddb4bd7b50d730c215ff871754d0da6b2178849f8a2a2ab69712d0c073b" +dependencies = [ + "objc", +] + [[package]] name = "object" version = "0.32.1" @@ -1661,7 +2743,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", ] [[package]] @@ -1682,18 +2764,52 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "orbclient" +version = "0.3.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8378ac0dfbd4e7895f2d2c1f1345cab3836910baf3a300b000d04250f0c8428f" +dependencies = [ + "redox_syscall 0.3.5", +] + +[[package]] +name = "ordered-stream" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aa2b01e1d916879f73a53d01d1d6cee68adbb31d6d9177a8cfce093cced1d50" +dependencies = [ + "futures-core", + "pin-project-lite", +] + [[package]] name = "overload" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "owned_ttf_parser" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "706de7e2214113d63a8238d1910463cfce781129a6f263d13fdb09ff64355ba4" +dependencies = [ + "ttf-parser", +] + [[package]] name = "owo-colors" version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f" +[[package]] +name = "parking" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e" + [[package]] name = "parking_lot" version = "0.12.1" @@ -1782,6 +2898,35 @@ version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +[[package]] +name = "png" +version = "0.17.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd75bf2d8dd3702b9707cdbc56a5b9ef42cec752eb8b3bafc01234558442aa64" +dependencies = [ + "bitflags 1.3.2", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + +[[package]] +name = "polling" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" +dependencies = [ + "autocfg", + "bitflags 1.3.2", + "cfg-if", + "concurrent-queue", + "libc", + "log", + "pin-project-lite", + "windows-sys 0.48.0", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -1799,7 +2944,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9825a04601d60621feed79c4e6b56d65db77cdca55cef43b46b0de1096d1c282" dependencies = [ "proc-macro2", - "syn 2.0.22", + "syn 2.0.29", +] + +[[package]] +name = "proc-macro-crate" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" +dependencies = [ + "once_cell", + "toml_edit", ] [[package]] @@ -1860,6 +3015,12 @@ dependencies = [ "getrandom", ] +[[package]] +name = "raw-window-handle" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ff9a1f06a88b01621b7ae906ef0211290d1c8a168a15542486a8f61c0833b9" + [[package]] name = "rayon" version = "1.7.0" @@ -2056,7 +3217,7 @@ dependencies = [ "libc", "log", "memchr", - "nix", + "nix 0.26.2", "radix_trie", "rustyline-derive", "scopeguard", @@ -2083,6 +3244,15 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.21" @@ -2092,12 +3262,31 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "sctk-adwaita" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda4e97be1fd174ccc2aae81c8b694e803fa99b34e8fd0f057a9d70698e3ed09" +dependencies = [ + "ab_glyph", + "log", + "memmap2", + "smithay-client-toolkit", + "tiny-skia", +] + [[package]] name = "security-framework" version = "2.9.1" @@ -2147,7 +3336,7 @@ checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", ] [[package]] @@ -2161,6 +3350,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_repr" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8725e1dfadb3a50f7e5ce0b1a540466f6ed3fe7a0fca2ac2b8b831d31316bd00" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.29", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2210,6 +3410,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +[[package]] +name = "signal-hook" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801" +dependencies = [ + "libc", + "signal-hook-registry", +] + [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -2219,6 +3429,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "slab" version = "0.4.8" @@ -2228,12 +3444,50 @@ dependencies = [ "autocfg", ] +[[package]] +name = "slotmap" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e08e261d0e8f5c43123b7adf3e4ca1690d655377ac93a03b2c9d3e98de1342" +dependencies = [ + "version_check", +] + [[package]] name = "smallvec" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +[[package]] +name = "smithay-client-toolkit" +version = "0.16.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f307c47d32d2715eb2e0ece5589057820e0e5e70d07c247d1063e844e107f454" +dependencies = [ + "bitflags 1.3.2", + "calloop", + "dlib", + "lazy_static", + "log", + "memmap2", + "nix 0.24.3", + "pkg-config", + "wayland-client", + "wayland-cursor", + "wayland-protocols", +] + +[[package]] +name = "smithay-clipboard" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a345c870a1fae0b1b779085e81b51e614767c239e93503588e54c5b17f4b0e8" +dependencies = [ + "smithay-client-toolkit", + "wayland-client", +] + [[package]] name = "socket2" version = "0.4.9" @@ -2277,7 +3531,13 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" name = "str-buf" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0" +checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0" + +[[package]] +name = "strict-num" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6637bab7722d379c8b41ba849228d680cc12d0a45ba1fa2b48f2a30577a06731" [[package]] name = "strsim" @@ -2304,9 +3564,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.22" +version = "2.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2efbeae7acf4eabd6bcdcbd11c92f45231ddda7539edc7806bd1a04a03b24616" +checksum = "c324c494eba9d92503e6f1ef2e6df781e78f6a7705a0202d9801b198807d518a" dependencies = [ "proc-macro2", "quote", @@ -2364,7 +3624,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", ] [[package]] @@ -2404,6 +3664,31 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-skia" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df8493a203431061e901613751931f047d1971337153f96d0e5e363d6dbf6a67" +dependencies = [ + "arrayref", + "arrayvec", + "bytemuck", + "cfg-if", + "png", + "tiny-skia-path", +] + +[[package]] +name = "tiny-skia-path" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adbfb5d3f3dd57a0e11d12f4f13d4ebbbc1b5c15b7ab0a156d030b21da5f677c" +dependencies = [ + "arrayref", + "bytemuck", + "strict-num", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -2481,7 +3766,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", ] [[package]] @@ -2517,6 +3802,23 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" + +[[package]] +name = "toml_edit" +version = "0.19.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8123f27e969974a3dfba720fdb560be359f57b44302d280ba72e76a74480e8a" +dependencies = [ + "indexmap 2.0.2", + "toml_datetime", + "winnow", +] + [[package]] name = "tower-service" version = "0.3.2" @@ -2555,7 +3857,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", ] [[package]] @@ -2603,12 +3905,28 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +[[package]] +name = "ttf-parser" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a464a4b34948a5f67fddd2b823c62d9d92e44be75058b99939eae6c5b6960b33" + [[package]] name = "typenum" version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +[[package]] +name = "uds_windows" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce65604324d3cce9b966701489fbd0cf318cb1f7bd9dd07ac9a4ee6fb791930d" +dependencies = [ + "tempfile", + "winapi", +] + [[package]] name = "unicode-bidi" version = "0.3.13" @@ -2686,12 +4004,34 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "waker-fn" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" + +[[package]] +name = "walkdir" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -2728,7 +4068,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", "wasm-bindgen-shared", ] @@ -2762,7 +4102,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.29", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2773,6 +4113,91 @@ version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +[[package]] +name = "wayland-client" +version = "0.29.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f3b068c05a039c9f755f881dc50f01732214f5685e379829759088967c46715" +dependencies = [ + "bitflags 1.3.2", + "downcast-rs", + "libc", + "nix 0.24.3", + "scoped-tls", + "wayland-commons", + "wayland-scanner", + "wayland-sys 0.29.5", +] + +[[package]] +name = "wayland-commons" +version = "0.29.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8691f134d584a33a6606d9d717b95c4fa20065605f798a3f350d78dced02a902" +dependencies = [ + "nix 0.24.3", + "once_cell", + "smallvec", + "wayland-sys 0.29.5", +] + +[[package]] +name = "wayland-cursor" +version = "0.29.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6865c6b66f13d6257bef1cd40cbfe8ef2f150fb8ebbdb1e8e873455931377661" +dependencies = [ + "nix 0.24.3", + "wayland-client", + "xcursor", +] + +[[package]] +name = "wayland-protocols" +version = "0.29.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b950621f9354b322ee817a23474e479b34be96c2e909c14f7bc0100e9a970bc6" +dependencies = [ + "bitflags 
1.3.2", + "wayland-client", + "wayland-commons", + "wayland-scanner", +] + +[[package]] +name = "wayland-scanner" +version = "0.29.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f4303d8fa22ab852f789e75a967f0a2cdc430a607751c0499bada3e451cbd53" +dependencies = [ + "proc-macro2", + "quote", + "xml-rs", +] + +[[package]] +name = "wayland-sys" +version = "0.29.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be12ce1a3c39ec7dba25594b97b42cb3195d54953ddb9d3d95a7c3902bc6e9d4" +dependencies = [ + "dlib", + "lazy_static", + "pkg-config", +] + +[[package]] +name = "wayland-sys" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b2a02ac608e07132978689a6f9bf4214949c85998c247abadd4f4129b1aa06" +dependencies = [ + "dlib", + "lazy_static", + "log", + "pkg-config", +] + [[package]] name = "web-sys" version = "0.3.64" @@ -2783,6 +4208,23 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webbrowser" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2c79b77f525a2d670cb40619d7d9c673d09e0666f72c591ebd7861f84a87e57" +dependencies = [ + "core-foundation", + "home", + "jni", + "log", + "ndk-context", + "objc", + "raw-window-handle", + "url", + "web-sys", +] + [[package]] name = "which" version = "4.4.0" @@ -2819,12 +4261,54 @@ dependencies = [ "winapi", ] +[[package]] +name = "winapi-wsapoll" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c17110f57155602a80dca10be03852116403c9ff3cd25b079d666f2aa3df6e" +dependencies = [ + "winapi", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-targets 0.48.1", +] + +[[package]] +name = "windows-implement" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e2ee588991b9e7e6c8338edf3333fbe4da35dc72092643958ebb43f0ab2c49c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "windows-interface" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6fb8df20c9bcaa8ad6ab513f7b40104840c8867d5751126e4df3b08388d0cc7" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "windows-sys" version = "0.42.0" @@ -2972,6 +4456,50 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "winit" +version = "0.28.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "866db3f712fffba75d31bf0cdecf357c8aeafd158c5b7ab51dba2a2b2d47f196" +dependencies = [ + "android-activity", + "bitflags 1.3.2", + "cfg_aliases", + "core-foundation", + "core-graphics", + "dispatch", + "instant", + "libc", + "log", + "mio", + "ndk", + "objc2", + "once_cell", + "orbclient", + "percent-encoding", + "raw-window-handle", + "redox_syscall 0.3.5", + "sctk-adwaita", + "smithay-client-toolkit", + "wasm-bindgen", + "wayland-client", + "wayland-commons", + 
"wayland-protocols", + "wayland-scanner", + "web-sys", + "windows-sys 0.45.0", + "x11-dl", +] + +[[package]] +name = "winnow" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c2e3184b9c4e92ad5167ca73039d0c42476302ab603e2fec4487511f38ccefc" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.10.1" @@ -2981,6 +4509,39 @@ dependencies = [ "winapi", ] +[[package]] +name = "x11-dl" +version = "2.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38735924fedd5314a6e548792904ed8c6de6636285cb9fec04d5b1db85c1516f" +dependencies = [ + "libc", + "once_cell", + "pkg-config", +] + +[[package]] +name = "x11rb" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "592b4883219f345e712b3209c62654ebda0bb50887f330cbd018d0f654bfd507" +dependencies = [ + "gethostname", + "nix 0.24.3", + "winapi", + "winapi-wsapoll", + "x11rb-protocol", +] + +[[package]] +name = "x11rb-protocol" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56b245751c0ac9db0e006dc812031482784e434630205a93c73cfefcaabeac67" +dependencies = [ + "nix 0.24.3", +] + [[package]] name = "xattr" version = "0.2.3" @@ -2990,6 +4551,97 @@ dependencies = [ "libc", ] +[[package]] +name = "xcursor" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "463705a63313cd4301184381c5e8042f0a7e9b4bb63653f216311d4ae74690b7" +dependencies = [ + "nom", +] + +[[package]] +name = "xdg-home" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2769203cd13a0c6015d515be729c526d041e9cf2c0cc478d57faee85f40c6dcd" +dependencies = [ + "nix 0.26.2", + "winapi", +] + +[[package]] +name = "xml-rs" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47430998a7b5d499ccee752b41567bc3afc57e1327dc855b1a2aa44ce29b5fa1" + +[[package]] +name = "zbus" +version = "3.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31de390a2d872e4cd04edd71b425e29853f786dc99317ed72d73d6fcf5ebb948" +dependencies = [ + "async-broadcast", + "async-executor", + "async-fs", + "async-io", + "async-lock", + "async-process", + "async-recursion", + "async-task", + "async-trait", + "blocking", + "byteorder", + "derivative", + "enumflags2", + "event-listener", + "futures-core", + "futures-sink", + "futures-util", + "hex", + "nix 0.26.2", + "once_cell", + "ordered-stream", + "rand", + "serde", + "serde_repr", + "sha1", + "static_assertions", + "tracing", + "uds_windows", + "winapi", + "xdg-home", + "zbus_macros", + "zbus_names", + "zvariant", +] + +[[package]] +name = "zbus_macros" +version = "3.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d1794a946878c0e807f55a397187c11fc7a038ba5d868e7db4f3bd7760bc9d" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "regex", + "syn 1.0.109", + "zvariant_utils", +] + +[[package]] +name = "zbus_names" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb80bb776dbda6e23d705cf0123c3b95df99c4ebeaec6c2599d4a5419902b4a9" +dependencies = [ + "serde", + "static_assertions", + "zvariant", +] + [[package]] name = "zip" version = "0.6.6" @@ -3058,3 +4710,41 @@ dependencies = [ "libc", "pkg-config", ] + +[[package]] +name = "zvariant" +version = "3.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "44b291bee0d960c53170780af148dca5fa260a63cdd24f1962fa82e03e53338c" +dependencies = [ + "byteorder", + "enumflags2", + "libc", + "serde", + "static_assertions", + "zvariant_derive", +] + +[[package]] +name = "zvariant_derive" +version = "3.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "934d7a7dfc310d6ee06c87ffe88ef4eca7d3e37bb251dece2ef93da8f17d8ecd" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 1.0.109", + "zvariant_utils", +] + +[[package]] +name = "zvariant_utils" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7234f0d811589db492d16893e3f21e8e2fd282e6d01b0cddee310322062cc200" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] diff --git a/Cargo.toml b/Cargo.toml index 045ecc9e..6c215d50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ memmap2 = "0.5.10" tracing-subscriber = { version = "0.3", features = ["env-filter"] } tracing = { version = "0.1", features = ["log"] } llm-samplers = "=0.0.7" +indexmap = "2.0.2" # Config for 'cargo dist' [workspace.metadata.dist] diff --git a/binaries/gguf-explorer/Cargo.toml b/binaries/gguf-explorer/Cargo.toml new file mode 100644 index 00000000..85a7aa78 --- /dev/null +++ b/binaries/gguf-explorer/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "gguf-explorer" +version = "0.1.0" +edition = "2021" +repository = { workspace = true } +license = { workspace = true } +publish = false + +[package.metadata.release] +release = false + +[dependencies] +ggml = { path = "../../crates/ggml" } + +anyhow = { workspace = true } + +eframe = "0.22" +egui_extras = "0.22" diff --git a/binaries/gguf-explorer/src/main.rs b/binaries/gguf-explorer/src/main.rs new file mode 100644 index 00000000..bb37bce2 --- /dev/null +++ b/binaries/gguf-explorer/src/main.rs @@ -0,0 +1,220 @@ +use std::{fmt::Display, fs::File, io::BufReader}; + +use egui_extras::{Column, TableBuilder}; +use ggml::format::gguf::{self, Gguf}; + +use eframe::egui::{self, Button, CentralPanel, CollapsingHeader, Label, RichText, TopBottomPanel}; + +fn main() -> eframe::Result<()> { + let file_path = match std::env::args().nth(1) { + Some(path) => path, + None => { + eprintln!("Usage: gguf-explorer "); + std::process::exit(1); + } + }; + + let mut file = File::open(file_path).expect("Failed to open file"); + let gguf = Gguf::load(&mut BufReader::new(&mut file)).expect("Failed to load gguf file"); + + let native_options = eframe::NativeOptions::default(); + eframe::run_native( + "GGUF Explorer", + native_options, + Box::new(move |_cc| { + Box::new(Explorer { + _file: file, + gguf, + + selected_tab: Tab::Metadata, + tensor_sort_order: TensorColumn::Offset, + }) + }), + ) +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub enum Tab { + Metadata, + Tensors, +} +impl Display for Tab { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Tab::Metadata => write!(f, "Metadata"), + Tab::Tensors => write!(f, "Tensors"), + } + } +} +impl Tab { + const ALL: [Tab; 2] = [Tab::Metadata, Tab::Tensors]; +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub enum TensorColumn { + Name, + Dimensions, + Type, + Offset, +} +impl Display for TensorColumn { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TensorColumn::Name => write!(f, "Name"), + TensorColumn::Dimensions => write!(f, "Dimensions"), + TensorColumn::Type => write!(f, "Type"), + TensorColumn::Offset => write!(f, "Offset"), + } + } +} +impl 
TensorColumn { + const ALL: [Self; 4] = [Self::Name, Self::Dimensions, Self::Type, Self::Offset]; +} + +struct Explorer { + _file: File, + gguf: Gguf, + + selected_tab: Tab, + tensor_sort_order: TensorColumn, +} +impl eframe::App for Explorer { + fn update(&mut self, ctx: &egui::Context, _frame: &mut eframe::Frame) { + TopBottomPanel::top("top_panel").show(ctx, |ui| { + ui.horizontal(|ui| { + for tab in Tab::ALL.iter().copied() { + let text = RichText::from(tab.to_string()); + let text = if tab == self.selected_tab { + text.underline() + } else { + text + }; + + if ui.add(Button::new(text)).clicked() { + self.selected_tab = tab; + } + } + }); + }); + + CentralPanel::default().show(ctx, |ui| match self.selected_tab { + Tab::Metadata => { + self.render_metadata(ui); + } + Tab::Tensors => { + self.render_tensors(ui); + } + }); + } +} +impl Explorer { + fn render_metadata(&mut self, ui: &mut egui::Ui) { + let metadata = &self.gguf.metadata; + let mut metadata_keys = metadata.keys().collect::>(); + metadata_keys.sort_by_key(|k| *k); + + TableBuilder::new(ui) + .striped(true) + .auto_shrink([false, true]) + .column(Column::auto().resizable(true)) + .column(Column::remainder().resizable(true)) + .header(20.0, |mut header| { + header.col(|ui| { + ui.label("Key"); + }); + header.col(|ui| { + ui.label("Value"); + }); + }) + .body(|mut body| { + for key in metadata_keys { + let value = metadata.get_optional(key).unwrap(); + + body.row(30.0, |mut row| { + row.col(|ui| { + ui.add(Label::new(monospace(key)).wrap(false)); + }); + row.col(|ui| match value { + gguf::MetadataValue::Array(value) => { + CollapsingHeader::new(format!("array ({} elements)", value.len())) + .id_source(key) + .show(ui, |ui| { + ui.add( + Label::new(monospace(format!("{:?}", value))) + .wrap(false), + ); + }); + } + value => { + ui.add(Label::new(monospace(format!("{:?}", value))).wrap(false)); + } + }); + }); + } + }); + } + + fn render_tensors(&mut self, ui: &mut egui::Ui) { + let tensors = &self.gguf.tensor_infos; + let mut tensor_names = tensors.keys().collect::>(); + match self.tensor_sort_order { + TensorColumn::Name => tensor_names.sort_by_key(|k| *k), + TensorColumn::Dimensions => { + tensor_names.sort_by_key(|k| tensors[*k].dimensions.clone()) + } + TensorColumn::Type => tensor_names.sort_by_key(|k| tensors[*k].element_type), + TensorColumn::Offset => tensor_names.sort_by_key(|k| tensors[*k].offset), + } + + TableBuilder::new(ui) + .striped(true) + .auto_shrink([false, true]) + .column(Column::remainder().resizable(true)) + .columns(Column::auto().resizable(true), 3) + .header(20.0, |mut header| { + for column in TensorColumn::ALL.iter().copied() { + header.col(|ui| { + let text = RichText::from(column.to_string()); + let text = if self.tensor_sort_order == column { + text.underline() + } else { + text + }; + + if ui.add(Button::new(text).wrap(false)).clicked() { + self.tensor_sort_order = column; + } + }); + } + }) + .body(|mut body| { + for tensor_name in tensor_names { + let tensor = &tensors[tensor_name]; + + body.row(30.0, |mut row| { + row.col(|ui| { + ui.add(Label::new(monospace(tensor_name)).wrap(false)); + }); + row.col(|ui| { + ui.add( + Label::new(monospace(format!("{:?}", tensor.dimensions))) + .wrap(false), + ); + }); + row.col(|ui| { + ui.add( + Label::new(monospace(tensor.element_type.to_string())).wrap(false), + ); + }); + row.col(|ui| { + ui.add(Label::new(monospace(tensor.offset.to_string())).wrap(false)); + }); + }); + } + }); + } +} + +fn monospace(text: impl Into) -> RichText { + 
RichText::new(text).monospace() +} diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index e158db68..da513180 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -1,5 +1,4 @@ use std::{ - fmt, ops::Deref, path::{Path, PathBuf}, }; @@ -7,7 +6,7 @@ use std::{ use clap::{Parser, ValueEnum}; use color_eyre::eyre::{self, WrapErr}; use llm::{ - ggml_format, samplers::build_sampler, ElementType, InferenceParameters, InferenceSessionConfig, + samplers::build_sampler, ElementType, InferenceParameters, InferenceSessionConfig, InvalidTokenBias, LoadProgress, Model, ModelKVMemoryType, ModelParameters, RoPEOverrides, TokenBias, TokenId, TokenizerSource, }; @@ -25,8 +24,11 @@ pub enum Args { Perplexity(Box), #[command()] - /// Get information about a GGML model. - Info(Box), + /// Interact with a GGUF model. + Gguf { + #[command(subcommand)] + gguf: Gguf, + }, #[command()] /// Dumps the prompt to console and exits, first as a comma-separated list of token IDs @@ -45,9 +47,9 @@ pub enum Args { /// and do not support a long enough context window to be able to /// have an extended conversation. Chat(Box), - - /// Quantize a GGML model to 4-bit. - Quantize(Box), + // + // /// Quantize a GGML model to 4-bit. + // Quantize(Box), } #[derive(Parser, Debug)] @@ -112,6 +114,13 @@ pub struct Perplexity { pub prompt: Prompt, } +#[derive(Parser, Debug)] +pub enum Gguf { + Info(Info), + Rebuild(Rebuild), + AddHfTokenizer(AddHfTokenizer), +} + #[derive(Parser, Debug)] pub struct Info { #[command(flatten)] @@ -126,6 +135,19 @@ pub struct Info { pub tokenizer: bool, } +#[derive(Parser, Debug)] +pub struct Rebuild { + pub input: PathBuf, + pub output: PathBuf, +} + +#[derive(Parser, Debug)] +pub struct AddHfTokenizer { + pub input: PathBuf, + pub output: PathBuf, + pub tokenizer: String, +} + #[derive(Parser, Debug)] pub struct PromptTokens { #[command(flatten)] @@ -436,22 +458,12 @@ impl ModelTokenizer { } } -#[derive(Parser, Debug)] -pub struct ModelArchitecture { - /// The model architecture to use. Will attempt to guess if not specified. - #[arg(long, short = 'a')] - pub model_architecture: Option, -} - #[derive(Parser, Debug)] pub struct ModelAndTokenizer { /// Where to load the model from #[arg(long, short = 'm')] pub model_path: PathBuf, - #[command(flatten)] - pub architecture: ModelArchitecture, - #[command(flatten)] pub tokenizer: ModelTokenizer, } @@ -528,7 +540,6 @@ impl ModelLoad { use_gpu, gpu_layers: self.gpu_layers, rope_overrides: self.rope_scaling.to_rope_arguments(), - n_gqa: None, }; let mut sp = Some(spinoff::Spinner::new( @@ -549,8 +560,7 @@ impl ModelLoad { } }; - let model = llm::load_dynamic( - self.model_and_tokenizer.architecture.model_architecture, + let model = llm::load( &self.model_and_tokenizer.model_path, tokenizer_source, params, @@ -639,56 +649,56 @@ pub fn read_prompt_file(path: &Path) -> eyre::Result { .wrap_err_with(|| format!("Could not read prompt file at {path:?}")) } -#[derive(Parser, Debug)] -pub struct Quantize { - #[command(flatten)] - pub architecture: ModelArchitecture, - - /// The path to the model to quantize - #[arg()] - pub source: PathBuf, - - /// The path to save the quantized model to - #[arg()] - pub destination: PathBuf, - - #[command(flatten)] - pub tokenizer: ModelTokenizer, - - /// The GGML container type to target. - /// - /// Note that using GGML requires the original model to have - /// an unscored vocabulary, which is not the case for newer models. 
- #[arg(short, long, default_value_t = SaveContainerType::GgjtV3)] - pub container_type: SaveContainerType, - - /// The format to convert to - pub target: QuantizationTarget, -} - -#[derive(Parser, Debug, ValueEnum, Clone, Copy)] -pub enum SaveContainerType { - /// GGML container. - Ggml, - /// GGJT v3 container. - GgjtV3, -} -impl fmt::Display for SaveContainerType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - SaveContainerType::Ggml => write!(f, "ggml"), - SaveContainerType::GgjtV3 => write!(f, "ggjt-v3"), - } - } -} -impl From for ggml_format::SaveContainerType { - fn from(value: SaveContainerType) -> Self { - match value { - SaveContainerType::Ggml => ggml_format::SaveContainerType::Ggml, - SaveContainerType::GgjtV3 => ggml_format::SaveContainerType::GgjtV3, - } - } -} +// #[derive(Parser, Debug)] +// pub struct Quantize { +// #[command(flatten)] +// pub architecture: ModelArchitecture, + +// /// The path to the model to quantize +// #[arg()] +// pub source: PathBuf, + +// /// The path to save the quantized model to +// #[arg()] +// pub destination: PathBuf, + +// #[command(flatten)] +// pub tokenizer: ModelTokenizer, + +// /// The GGML container type to target. +// /// +// /// Note that using GGML requires the original model to have +// /// an unscored vocabulary, which is not the case for newer models. +// #[arg(short, long, default_value_t = SaveContainerType::GgjtV3)] +// pub container_type: SaveContainerType, + +// /// The format to convert to +// pub target: QuantizationTarget, +// } + +// #[derive(Parser, Debug, ValueEnum, Clone, Copy)] +// pub enum SaveContainerType { +// /// GGML container. +// Ggml, +// /// GGJT v3 container. +// GgjtV3, +// } +// impl fmt::Display for SaveContainerType { +// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +// match self { +// SaveContainerType::Ggml => write!(f, "ggml"), +// SaveContainerType::GgjtV3 => write!(f, "ggjt-v3"), +// } +// } +// } +// impl From for ggml_format::ggml::SaveContainerType { +// fn from(value: SaveContainerType) -> Self { +// match value { +// SaveContainerType::Ggml => ggml_format::ggml::SaveContainerType::Ggml, +// SaveContainerType::GgjtV3 => ggml_format::ggml::SaveContainerType::GgjtV3, +// } +// } +// } #[derive(Parser, Debug, ValueEnum, Clone, Copy)] #[clap(rename_all = "snake_case")] diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 242b42c8..08657207 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -1,13 +1,16 @@ use std::{ convert::Infallible, + fmt, fs::File, - io::{BufReader, BufWriter}, + io::{BufReader, BufWriter, Read, Seek}, + path::Path, }; use clap::Parser; use cli_args::Args; -use color_eyre::eyre::{self, Context, ContextCompat}; +use color_eyre::eyre; use is_terminal::IsTerminal; +use llm::ggml_format::gguf::{self, MetadataValue}; mod cli_args; mod interactive; @@ -17,7 +20,11 @@ mod util; fn main() -> eyre::Result<()> { tracing_subscriber::fmt() .with_writer(std::io::stderr) - .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .with_env_filter( + tracing_subscriber::EnvFilter::builder() + .with_default_directive(tracing_subscriber::filter::LevelFilter::INFO.into()) + .from_env_lossy(), + ) .with_ansi(std::io::stderr().is_terminal()) .init(); @@ -27,11 +34,11 @@ fn main() -> eyre::Result<()> { match args { Args::Infer(args) => infer(&args), Args::Perplexity(args) => perplexity(&args), - Args::Info(args) => info(&args), + Args::Gguf { gguf: args } => gguf(&args), 
Args::PromptTokens(args) => prompt_tokens(&args), Args::Repl(args) => interactive::repl(&args), Args::Chat(args) => interactive::chat(&args), - Args::Quantize(args) => quantize(&args), + // Args::Quantize(args) => quantize(&args), } } @@ -127,55 +134,137 @@ fn perplexity(args: &cli_args::Perplexity) -> eyre::Result<()> { Ok(()) } +fn gguf(args: &cli_args::Gguf) -> eyre::Result<()> { + match args { + cli_args::Gguf::Info(args) => info(args), + cli_args::Gguf::Rebuild(args) => rebuild(args), + cli_args::Gguf::AddHfTokenizer(args) => add_hf_tokenizer(args), + } +} + fn info(args: &cli_args::Info) -> eyre::Result<()> { - struct InfoVisitor<'a>(&'a cli_args::Info); - impl llm::ModelArchitectureVisitor> for InfoVisitor<'_> { - fn visit(&mut self) -> eyre::Result<()> { - let args = self.0; - - let model_path = &args.model_and_tokenizer.model_path; - let tokenizer = args.model_and_tokenizer.to_source()?.retrieve(model_path)?; - - let file = File::open(model_path)?; - let mut reader = BufReader::new(&file); - let mut loader: llm::Loader = - llm::Loader::new(tokenizer, |_| { - // We purposely do not print progress here, as we are only interested in the metadata - }); - - llm::ggml_format::load(&mut reader, &mut loader)?; - - log::info!("Container type: {:?}", loader.container_type); - log::info!("Hyperparameters: {:?}", loader.hyperparameters); - log::info!("Tokenizer vocabulary size: {}", loader.tokenizer.len()); - - if args.tokenizer { - log::info!("Tokens:"); - for i in 0..loader.tokenizer.len() { - log::info!("- {}: {}", i, utf8_or_array(&loader.tokenizer.token(i))); + let model_path = &args.model_and_tokenizer.model_path; + + let file = File::open(model_path)?; + let mut reader = BufReader::new(&file); + let gguf = gguf::Gguf::load(&mut reader)?; + + log::info!("Non-array parameters:"); + for (metadata_key, metadata_value) in gguf.metadata.iter() { + struct ValueDisplay<'a>(Option<&'a MetadataValue>); + impl fmt::Debug for ValueDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(value) = self.0 { + write!(f, "{:?}", value) + } else { + write!(f, "[elided due to size]") } } + } + + let elide_due_to_size = + metadata_value.as_array().is_some() || metadata_key == "tokenizer.huggingface.json"; + + log::info!( + "- {}: {:?}", + metadata_key, + ValueDisplay(if elide_due_to_size { + None + } else { + Some(metadata_value) + }) + ); + } + + if let Ok(tokenizer) = llm::tokenizer::GgufEmbeddedTokenizer::from_metadata(&gguf.metadata) { + log::info!( + "Embedded tokenizer vocabulary size: {}", + tokenizer.tokens.len() + ); + + if args.tokenizer { + log::info!("Embedded tokenizer vocabulary:"); + for (i, token) in tokenizer.tokens.iter().enumerate() { + log::info!("- {}: {}", i, token); + } if args.tensors { log::info!("Tensors:"); - for (name, tensor) in &loader.tensors { - log::info!("- {} ({:?} {:?})", name, tensor.element_type, tensor.dims()); + for (name, tensor) in &gguf.tensor_infos { + log::info!( + "- {} ({:?} {:?})", + name, + tensor.element_type, + tensor.dimensions + ); } } + } + } - fn utf8_or_array(token: &[u8]) -> String { - std::str::from_utf8(token).map_or(format!("{:?}", token), |s| s.to_owned()) - } - - Ok(()) + if args.tensors { + log::info!("Tensors:"); + for (name, tensor) in &gguf.tensor_infos { + log::info!( + "- {} ({:?} {:?}) @ 0x{:X}", + name, + tensor.element_type, + tensor.dimensions, + tensor.offset + ); } } - args.model_and_tokenizer - .architecture - .model_architecture - .wrap_err("a model architecture is required at present")? 
- .visit(&mut InfoVisitor(args)) + Ok(()) +} + +fn rebuild(args: &cli_args::Rebuild) -> eyre::Result<()> { + rebuild_with_mutation(&args.input, &args.output, |_| Ok(())) +} + +fn add_hf_tokenizer(args: &cli_args::AddHfTokenizer) -> eyre::Result<()> { + let tokenizer = + llm::tokenizer::huggingface_tokenizers::Tokenizer::from_pretrained(&args.tokenizer, None) + .unwrap(); + + rebuild_with_mutation(&args.input, &args.output, move |gguf| { + let tokenizer = tokenizer.to_string(false).unwrap(); + gguf.metadata + .insert("tokenizer.huggingface.json", tokenizer); + + Ok(()) + }) +} + +fn rebuild_with_mutation( + input: &Path, + output: &Path, + mut mutator: impl FnMut(&mut gguf::Gguf) -> eyre::Result<()>, +) -> eyre::Result<()> { + eyre::ensure!(input != output, "input and output must be different files"); + + let input = File::open(input)?; + let mut reader = BufReader::new(&input); + let mut gguf = gguf::Gguf::load(&mut reader)?; + + let mut output = File::create(output)?; + let mut writer = BufWriter::new(&mut output); + + mutator(&mut gguf)?; + gguf.save(&mut writer, |writer, name, _info| { + let reader = &mut reader; + let original_info = gguf.tensor_infos.get(name).unwrap(); + + reader.seek(std::io::SeekFrom::Start( + gguf.tensor_data_position + original_info.offset, + ))?; + + std::io::copy(&mut reader.take(original_info.calc_size() as u64), writer)?; + + Ok(()) + })?; + + Ok(()) } fn prompt_tokens(args: &cli_args::PromptTokens) -> eyre::Result<()> { @@ -207,65 +296,65 @@ fn prompt_tokens(args: &cli_args::PromptTokens) -> eyre::Result<()> { Ok(()) } -fn quantize(args: &cli_args::Quantize) -> eyre::Result<()> { - use llm::QuantizeProgress; - - struct QuantizeVisitor<'a>(&'a cli_args::Quantize); - impl llm::ModelArchitectureVisitor> for QuantizeVisitor<'_> { - fn visit(&mut self) -> eyre::Result<()> { - let args = self.0; - - let mut source: BufReader = BufReader::new(std::fs::File::open(&args.source)?); - let mut destination: BufWriter = - BufWriter::new(std::fs::File::create(&args.destination)?); - let tokenizer: llm::Tokenizer = args.tokenizer.to_source()?.retrieve(&args.source)?; - - llm::quantize::( - &mut source, - &mut destination, - tokenizer, - args.container_type.into(), - args.target.into(), - |progress| match progress { - QuantizeProgress::HyperparametersLoaded => log::info!("Loaded hyperparameters"), - QuantizeProgress::TensorLoading { - name, - dims, - element_type, - n_elements, - } => log::info!( - "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)" - ), - QuantizeProgress::TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"), - QuantizeProgress::TensorQuantized { - name, - original_size, - reduced_size, - history, - } => log::info!( - "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})" - ), - QuantizeProgress::TensorSkipped { name, size } => { - log::info!("Skipped tensor `{name}` ({size} bytes)") - } - QuantizeProgress::Finished { - original_size, - reduced_size, - history, - } => log::info!( - "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})" - ), - }, - ) - .wrap_err("failed to quantize model") - } - } - - args.architecture - .model_architecture - .wrap_err("the architecture must be known for quantization")? 
- .visit(&mut QuantizeVisitor(args)) -} +// fn quantize(args: &cli_args::Quantize) -> eyre::Result<()> { +// use llm::QuantizeProgress; + +// struct QuantizeVisitor<'a>(&'a cli_args::Quantize); +// impl llm::ModelArchitectureVisitor> for QuantizeVisitor<'_> { +// fn visit(&mut self) -> eyre::Result<()> { +// let args = self.0; + +// let mut source: BufReader = BufReader::new(std::fs::File::open(&args.source)?); +// let mut destination: BufWriter = +// BufWriter::new(std::fs::File::create(&args.destination)?); +// let tokenizer: llm::Tokenizer = args.tokenizer.to_source()?.retrieve(&args.source)?; + +// llm::quantize::( +// &mut source, +// &mut destination, +// tokenizer, +// args.container_type.into(), +// args.target.into(), +// |progress| match progress { +// QuantizeProgress::HyperparametersLoaded => log::info!("Loaded hyperparameters"), +// QuantizeProgress::TensorLoading { +// name, +// dims, +// element_type, +// n_elements, +// } => log::info!( +// "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)" +// ), +// QuantizeProgress::TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"), +// QuantizeProgress::TensorQuantized { +// name, +// original_size, +// reduced_size, +// history, +// } => log::info!( +// "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})" +// ), +// QuantizeProgress::TensorSkipped { name, size } => { +// log::info!("Skipped tensor `{name}` ({size} bytes)") +// } +// QuantizeProgress::Finished { +// original_size, +// reduced_size, +// history, +// } => log::info!( +// "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})" +// ), +// }, +// ) +// .wrap_err("failed to quantize model") +// } +// } + +// args.architecture +// .model_architecture +// .wrap_err("the architecture must be known for quantization")? 
+// .visit(&mut QuantizeVisitor(args)) +// } fn load_prompt_file_with_prompt( prompt_file: &cli_args::PromptFile, diff --git a/binaries/llm-test/configs/bloom.json b/binaries/llm-test/configs/bloom.json index 5383386d..cec5e750 100644 --- a/binaries/llm-test/configs/bloom.json +++ b/binaries/llm-test/configs/bloom.json @@ -1,7 +1,6 @@ { "url": "https://huggingface.co/rustformers/bloom-ggml/resolve/main/bloom-560m-q4_0.bin", "filename": "bloom.bin", - "architecture": "bloom", "test_cases": [ { "Inference": { diff --git a/binaries/llm-test/configs/gptj.json b/binaries/llm-test/configs/gptj.json index 50966748..febf76f9 100644 --- a/binaries/llm-test/configs/gptj.json +++ b/binaries/llm-test/configs/gptj.json @@ -1,7 +1,6 @@ { "url": "https://huggingface.co/rustformers/gpt-j-ggml/resolve/main/gpt-j-6b-q4_0-ggjt.bin", "filename": "gptj.bin", - "architecture": "gptj", "test_cases": [ { "Inference": { diff --git a/binaries/llm-test/configs/gptneox.json b/binaries/llm-test/configs/gptneox.json index c8cce4d9..96c58906 100644 --- a/binaries/llm-test/configs/gptneox.json +++ b/binaries/llm-test/configs/gptneox.json @@ -1,7 +1,6 @@ { "url": "https://huggingface.co/rustformers/redpajama-3b-ggml/resolve/main/RedPajama-INCITE-Base-3B-v1-q4_0-ggjt.bin", "filename": "gptneox.bin", - "architecture": "gptneox", "test_cases": [ { "Inference": { diff --git a/binaries/llm-test/configs/llama.json b/binaries/llm-test/configs/llama.json index 9bd6094a..9eec8a73 100644 --- a/binaries/llm-test/configs/llama.json +++ b/binaries/llm-test/configs/llama.json @@ -1,7 +1,6 @@ { "url": "https://huggingface.co/rustformers/open-llama-ggml/resolve/main/open_llama_3b-q4_0-ggjt.bin", "filename": "llama.bin", - "architecture": "llama", "test_cases": [ { "Inference": { diff --git a/binaries/llm-test/configs/mpt.json b/binaries/llm-test/configs/mpt.json index c5d9d8d0..31540573 100644 --- a/binaries/llm-test/configs/mpt.json +++ b/binaries/llm-test/configs/mpt.json @@ -1,7 +1,6 @@ { "url": "https://huggingface.co/rustformers/mpt-7b-ggml/resolve/main/mpt-7b-q4_0-ggjt.bin", "filename": "mpt.bin", - "architecture": "mpt", "test_cases": [ { "Inference": { diff --git a/binaries/llm-test/src/common.rs b/binaries/llm-test/src/common.rs index 4c858820..f910d095 100644 --- a/binaries/llm-test/src/common.rs +++ b/binaries/llm-test/src/common.rs @@ -1,6 +1,8 @@ //! Tests that are run on every model, regardless of config. 
-pub(super) fn can_send(model: M) -> anyhow::Result { +use llm::Model; + +pub(super) fn can_send(model: Box) -> anyhow::Result> { let model = std::thread::spawn(move || model) .join() .map_err(|e| anyhow::anyhow!("Failed to join thread: {e:?}")); @@ -10,21 +12,21 @@ pub(super) fn can_send(model: M) -> anyhow::Result model } -pub(super) fn can_roundtrip_hyperparameters( - model: &M, -) -> anyhow::Result<()> { - fn test_hyperparameters(hyperparameters: &M) -> anyhow::Result<()> { - let mut data = vec![]; - hyperparameters.write_ggml(&mut data)?; - let new_hyperparameters = - ::read_ggml(&mut std::io::Cursor::new(data))?; +// pub(super) fn can_roundtrip_hyperparameters( +// model: &M, +// ) -> anyhow::Result<()> { +// fn test_hyperparameters(hyperparameters: &M) -> anyhow::Result<()> { +// let mut data = vec![]; +// hyperparameters.write_ggml(&mut data)?; +// let new_hyperparameters = +// ::read_ggml(&mut std::io::Cursor::new(data))?; - assert_eq!(hyperparameters, &new_hyperparameters); +// assert_eq!(hyperparameters, &new_hyperparameters); - log::info!("`can_roundtrip_hyperparameters` test passed!"); +// log::info!("`can_roundtrip_hyperparameters` test passed!"); - Ok(()) - } +// Ok(()) +// } - test_hyperparameters(model.hyperparameters()) -} +// test_hyperparameters(model.hyperparameters()) +// } diff --git a/binaries/llm-test/src/delete.rs b/binaries/llm-test/src/delete.rs index 7bcf81df..d8e40aa8 100644 --- a/binaries/llm-test/src/delete.rs +++ b/binaries/llm-test/src/delete.rs @@ -12,7 +12,7 @@ use serde::Serialize; use crate::{TestCaseReport, TestCaseReportMeta}; /// Tests that models can delete tokens without changing the model's behavior. -pub(crate) fn can_delete(model: &impl Model) -> TestCaseReport { +pub(crate) fn can_delete(model: &dyn Model) -> TestCaseReport { let report = DeleteReport::default(); let mut session = model.start_session(Default::default()); let mut output = OutputRequest { @@ -61,7 +61,7 @@ pub(crate) fn can_delete(model: &impl Model) -> TestCaseReport { fn feed_prompt( prompt: &str, session: &mut InferenceSession, - model: &impl Model, + model: &dyn Model, output: &mut OutputRequest, ) -> Result<(), llm::InferenceError> { session.feed_prompt(model, prompt, output, always_continue) diff --git a/binaries/llm-test/src/main.rs b/binaries/llm-test/src/main.rs index b1bc9b07..0493821f 100644 --- a/binaries/llm-test/src/main.rs +++ b/binaries/llm-test/src/main.rs @@ -18,7 +18,6 @@ use std::{ fs::{self, File}, io::Write, path::{Path, PathBuf}, - str::FromStr, time::Instant, }; @@ -61,7 +60,7 @@ async fn main() -> anyhow::Result<()> { fs::create_dir_all(&results_dir)?; // Load configurations - let test_configs: HashMap = fs::read_dir(configs_dir)? + let mut test_configs: HashMap = fs::read_dir(configs_dir)? .filter_map(Result::ok) .map(|de| de.path()) .filter(|p| p.is_file()) @@ -78,24 +77,20 @@ async fn main() -> anyhow::Result<()> { }; // Test models - let mut test_configs = if let Some(specific_architecture) = specific_model { - vec![test_configs - .get(&specific_architecture) - .with_context(|| { - format!( - "No config found for `{specific_architecture}`. Available configs: {:?}", - test_configs.keys() - ) - })? 
- .clone()] - } else { - test_configs.values().cloned().collect() - }; - test_configs.sort_by_key(|tc| tc.architecture.clone()); + if let Some(specific_architecture) = specific_model { + test_configs.retain(|k, _| *k == specific_architecture); + } let test_configs_len = test_configs.len(); - for test_config in test_configs { - test_model(&model_config, &test_config, &download_dir, &results_dir).await?; + for (test_name, test_config) in &test_configs { + test_model( + &model_config, + test_name, + test_config, + &download_dir, + &results_dir, + ) + .await?; if test_configs_len > 1 { log::info!("----"); } @@ -114,7 +109,6 @@ struct ModelConfig { struct TestConfig { url: String, filename: PathBuf, - architecture: String, test_cases: Vec, } @@ -165,13 +159,12 @@ pub enum TestCaseReportInner { async fn test_model( model_config: &ModelConfig, + test_name: &str, test_config: &TestConfig, download_dir: &Path, results_dir: &Path, ) -> anyhow::Result<()> { // Load the model - let architecture = llm::ModelArchitecture::from_str(&test_config.architecture)?; - let local_path = if test_config.filename.is_file() { // If this filename points towards a valid file, use it test_config.filename.clone() @@ -180,160 +173,127 @@ async fn test_model( download_dir.join(&test_config.filename) }; - log::info!( - "Testing architecture: `{}` ({})", - test_config.architecture, - local_path.display() - ); + log::info!("Testing `{test_name}`: `{}`", local_path.display()); // Download the model if necessary download_file(&test_config.url, &local_path).await?; - struct TestVisitor<'a> { - model_config: &'a ModelConfig, - test_config: &'a TestConfig, - results_dir: &'a Path, - local_path: &'a Path, - } - impl<'a> llm::ModelArchitectureVisitor> for TestVisitor<'a> { - fn visit(&mut self) -> anyhow::Result<()> { - let Self { - model_config, - test_config, - results_dir, - local_path, - } = *self; - - let start_time = Instant::now(); - - let model = { - let model = llm::load::( - local_path, - llm::TokenizerSource::Embedded, - llm::ModelParameters { - prefer_mmap: model_config.mmap, - ..Default::default() - }, - |progress| { - let print = !matches!(&progress, - llm::LoadProgress::TensorLoaded { current_tensor, tensor_count } - if current_tensor % (tensor_count / 10) != 0 - ); - - if print { - log::info!("loading: {:?}", progress); - } - }, + let start_time = Instant::now(); + + let model = { + let model = llm::load( + &local_path, + llm::TokenizerSource::Embedded, + llm::ModelParameters { + prefer_mmap: model_config.mmap, + ..Default::default() + }, + |progress| { + let print = !matches!(&progress, + llm::LoadProgress::TensorLoaded { current_tensor, tensor_count } + if current_tensor % (tensor_count / 10) != 0 ); - match model { - Ok(m) => m, - Err(err) => { - write_report( - test_config, - results_dir, - &Report::LoadFail { - error: format!("Failed to load model: {}", err), - }, - )?; - - return Err(err.into()); - } - } - }; - - log::info!( - "Model fully loaded! 
Elapsed: {}ms", - start_time.elapsed().as_millis() - ); - - // - // Non-model-specific tests - // - - // Confirm that the model can be sent to a thread, then sent back - let model = common::can_send(model)?; - - // Confirm that the hyperparameters can be roundtripped - common::can_roundtrip_hyperparameters(&model)?; - - // - - // - // Model-specific tests - // - - // Run the test cases - let mut test_case_reports = vec![]; - for test_case in &test_config.test_cases { - match test_case { - TestCase::Inference { - input, - output, - maximum_token_count, - } => test_case_reports.push(inference::can_infer( - &model, - model_config, - input, - output.as_deref(), - *maximum_token_count, - )?), - TestCase::Tokens { input, output } => { - test_case_reports.push(tokens::can_feed(&model, input, *output)); - } - TestCase::Delete {} => { - test_case_reports.push(delete::can_delete(&model)); - } + if print { + log::info!("loading: {:?}", progress); } + }, + ); + + match model { + Ok(m) => m, + Err(err) => { + write_report( + test_name, + results_dir, + &Report::LoadFail { + error: format!("Failed to load model: {}", err), + }, + )?; + + return Err(err.into()); } - let first_error: Option = - test_case_reports - .iter() - .find_map(|report: &TestCaseReport| match &report.meta { - TestCaseReportMeta::Error { error } => Some(error.clone()), - _ => None, - }); - - // Save the results - // Serialize the report to a JSON string - write_report( - test_config, - results_dir, - &Report::LoadSuccess { - test_cases: test_case_reports, - }, - )?; - - // Optionally, panic if there was an error - if let Some(err) = first_error { - panic!("Error: {}", err); - } + } + }; - log::info!( - "Successfully tested architecture `{}`!", - test_config.architecture - ); + log::info!( + "Model fully loaded! 
Elapsed: {}ms", + start_time.elapsed().as_millis() + ); + + // + // Non-model-specific tests + // + + // Confirm that the model can be sent to a thread, then sent back + let model = common::can_send(model)?; + + // Confirm that the hyperparameters can be roundtripped + // common::can_roundtrip_hyperparameters(&model)?; + + // - Ok(()) + // + // Model-specific tests + // + + // Run the test cases + let mut test_case_reports = vec![]; + for test_case in &test_config.test_cases { + match test_case { + TestCase::Inference { + input, + output, + maximum_token_count, + } => test_case_reports.push(inference::can_infer( + model.as_ref(), + model_config, + input, + output.as_deref(), + *maximum_token_count, + )?), + TestCase::Tokens { input, output } => { + test_case_reports.push(tokens::can_feed(model.as_ref(), input, *output)); + } + TestCase::Delete {} => { + test_case_reports.push(delete::can_delete(model.as_ref())); + } } } - architecture.visit(&mut TestVisitor { - model_config, - test_config, + let first_error: Option = + test_case_reports + .iter() + .find_map(|report: &TestCaseReport| match &report.meta { + TestCaseReportMeta::Error { error } => Some(error.clone()), + _ => None, + }); + + // Save the results + // Serialize the report to a JSON string + write_report( + test_name, results_dir, - local_path: &local_path, - })?; + &Report::LoadSuccess { + test_cases: test_case_reports, + }, + )?; + + // Optionally, panic if there was an error + if let Some(err) = first_error { + panic!("Error: {}", err); + } + + log::info!( + "Successfully tested `{test_name}`: `{}`!", + local_path.display() + ); Ok(()) } -fn write_report( - test_config: &TestConfig, - results_dir: &Path, - report: &Report, -) -> anyhow::Result<()> { +fn write_report(test_name: &str, results_dir: &Path, report: &Report) -> anyhow::Result<()> { let json_report = serde_json::to_string_pretty(&report)?; - let report_path = results_dir.join(format!("{}.json", test_config.architecture)); + let report_path = results_dir.join(format!("{test_name}.json")); fs::write(report_path, json_report)?; Ok(()) } diff --git a/binaries/llm-test/src/tokens.rs b/binaries/llm-test/src/tokens.rs index adddd678..b9fed471 100644 --- a/binaries/llm-test/src/tokens.rs +++ b/binaries/llm-test/src/tokens.rs @@ -12,7 +12,7 @@ use serde::Serialize; use crate::{TestCaseReport, TestCaseReportMeta}; /// Tests that the model performs as expected when feeding tokens -pub(crate) fn can_feed(model: &impl Model, input: &str, expected_output: usize) -> TestCaseReport { +pub(crate) fn can_feed(model: &dyn Model, input: &str, expected_output: usize) -> TestCaseReport { let mut report = TokensReport::default(); let mut session = model.start_session(Default::default()); let mut output = OutputRequest { @@ -62,7 +62,7 @@ pub(crate) fn can_feed(model: &impl Model, input: &str, expected_output: usize) fn feed_prompt( prompt: &str, session: &mut InferenceSession, - model: &impl Model, + model: &dyn Model, output: &mut OutputRequest, ) -> Result<(), llm::InferenceError> { session.feed_prompt(model, prompt, output, always_continue) diff --git a/crates/ggml/Cargo.toml b/crates/ggml/Cargo.toml index fe60f7a9..108a98e2 100644 --- a/crates/ggml/Cargo.toml +++ b/crates/ggml/Cargo.toml @@ -11,12 +11,15 @@ ggml-sys = { path = "sys", version = "0.2.0-dev" } thiserror = { workspace = true } memmap2 = { workspace = true } +indexmap = { workspace = true } [dev-dependencies] rand = { workspace = true } anyhow = { workspace = true } [features] +# Whether or not the pre-GGUF loading/saving 
code is exposed. +pre-gguf-formats = [] cublas = ["ggml-sys/cublas"] clblast = ["ggml-sys/clblast"] metal = ["ggml-sys/metal"] diff --git a/crates/ggml/src/format/loader.rs b/crates/ggml/src/format/ggml/loader.rs similarity index 71% rename from crates/ggml/src/format/loader.rs rename to crates/ggml/src/format/ggml/loader.rs index 8b94e6a3..5dbe86aa 100644 --- a/crates/ggml/src/format/loader.rs +++ b/crates/ggml/src/format/ggml/loader.rs @@ -6,67 +6,16 @@ use std::{ error::Error, - fmt, io::{BufRead, Seek, SeekFrom}, }; use crate::{ + format::{data_size, header_size, ContainerType, ContainerTypeReadError}, util::{has_data_left, read_bytes_with_len, read_f32, read_i32, read_u32}, - ContainerType, ElementType, + ElementType, }; -/// Helper struct that wraps the magic number of a file format, -/// so that it can be printed in a human-readable format. -pub struct FormatMagic(pub u32); -impl fmt::Display for FormatMagic { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{:x} ({})", - self.0, - String::from_utf8_lossy(&self.0.to_le_bytes()) - ) - } -} -impl fmt::Debug for FormatMagic { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - fmt::Display::fmt(self, f) - } -} - -#[derive(Debug, thiserror::Error)] -/// Errors that can occur while loading a model. -pub enum LoadError { - #[error("invalid file magic number: {0}")] - /// The file magic number is invalid. - InvalidMagic(FormatMagic), - #[error("invalid ggml format: format={0:?}")] - /// An unsupported format version was found. - InvalidFormatVersion(ContainerType), - #[error("non-specific I/O error")] - /// A non-specific IO error. - Io(#[from] std::io::Error), - #[error("could not convert bytes to a UTF-8 string")] - /// One of the strings encountered was not valid UTF-8. - InvalidUtf8(#[from] std::string::FromUtf8Error), - #[error("invalid integer conversion")] - /// One of the integers encountered could not be converted to a more appropriate type. - InvalidIntegerConversion(#[from] std::num::TryFromIntError), - #[error("implementation error")] - /// An error `E` was returned by the implementation of the loader. - ImplementationError(#[source] E), - #[error("unsupported tensor type {ftype} for tensor {tensor_name}")] - /// One of the tensors encountered had an unsupported data type. - UnsupportedElementType { - /// The name of the tensor. - tensor_name: String, - /// The format type that was encountered. - ftype: u32, - }, - #[error("invariant broken: {0}")] - /// An invariant was broken. - InvariantBroken(String), -} +use super::LoadError; #[derive(Debug, Clone)] /// Information about a [tensor](https://en.wikipedia.org/wiki/Tensor_(machine_learning)) that is being read. @@ -118,21 +67,6 @@ impl TensorLoadInfo { } } -/// Returns the size occupied by a tensor's data in bytes given the element type and number of elements. -pub(crate) fn data_size(element_type: ElementType, n_elements: usize) -> usize { - (crate::type_size(element_type) * n_elements) / crate::blck_size(element_type) -} - -/// Returns the size of the ggml tensor header in bytes. -pub(crate) fn header_size() -> usize { - crate::Tensor::C_TYPE_SIZE + crate::OBJECT_SIZE -} - -/// Returns the size of a tensor in bytes given the element type and number of elements. This includes the tensor's header. 
-pub fn tensor_size(element_type: ElementType, n_elements: usize) -> usize { - header_size() + data_size(element_type, n_elements) -} - #[derive(Debug, Clone)] /// Information present within GGML [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) /// that is required to continue loading the model. @@ -162,7 +96,10 @@ pub fn load( handler: &mut impl LoadHandler, ) -> Result<(), LoadError> { // Verify magic - let container_type = ContainerType::read(reader)?; + let container_type = ContainerType::read(reader).map_err(|e| match e { + ContainerTypeReadError::InvalidMagic(magic) => LoadError::InvalidMagic(magic), + ContainerTypeReadError::Io(io) => LoadError::Io(io), + })?; match container_type { ContainerType::Ggml @@ -192,6 +129,9 @@ pub fn load( // Legacy model, set empty score 0. } + ContainerType::Gguf(_) => { + unreachable!("This loader should not be used with GGUF") + } }; handler .vocabulary_token(i, token, token_score) @@ -204,6 +144,9 @@ pub fn load( ContainerType::Ggjt(_version) | ContainerType::Ggla(_version) => { load_weights(reader, handler, true) } + ContainerType::Gguf(_) => { + unreachable!("This loader should not be used with GGUF") + } } } diff --git a/crates/ggml/src/format/ggml/mod.rs b/crates/ggml/src/format/ggml/mod.rs new file mode 100644 index 00000000..4a526756 --- /dev/null +++ b/crates/ggml/src/format/ggml/mod.rs @@ -0,0 +1,49 @@ +//! Loading and saving of [GGML](https://github.com/ggerganov/ggml) files. + +mod loader; +mod saver; + +use std::error::Error; + +use super::ContainerType; +use crate::util; + +pub use loader::*; +pub use saver::*; + +#[cfg(test)] +mod tests; + +#[derive(Debug, thiserror::Error)] +/// Errors that can occur while loading a model. +pub enum LoadError { + #[error("invalid file magic value: {0}")] + /// The file's magic value is invalid. + InvalidMagic(util::FileMagic), + #[error("invalid ggml format: format={0:?}")] + /// An unsupported format version was found. + InvalidFormatVersion(ContainerType), + #[error("non-specific I/O error")] + /// A non-specific IO error. + Io(#[from] std::io::Error), + #[error("could not convert bytes to a UTF-8 string")] + /// One of the strings encountered was not valid UTF-8. + InvalidUtf8(#[from] std::string::FromUtf8Error), + #[error("invalid integer conversion")] + /// One of the integers encountered could not be converted to a more appropriate type. + InvalidIntegerConversion(#[from] std::num::TryFromIntError), + #[error("implementation error")] + /// An error `E` was returned by the implementation of the loader. + ImplementationError(#[source] E), + #[error("unsupported tensor type {ftype} for tensor {tensor_name}")] + /// One of the tensors encountered had an unsupported data type. + UnsupportedElementType { + /// The name of the tensor. + tensor_name: String, + /// The format type that was encountered. + ftype: u32, + }, + #[error("invariant broken: {0}")] + /// An invariant was broken. + InvariantBroken(String), +} diff --git a/crates/ggml/src/format/saver.rs b/crates/ggml/src/format/ggml/saver.rs similarity index 98% rename from crates/ggml/src/format/saver.rs rename to crates/ggml/src/format/ggml/saver.rs index 86b4bd24..d8b87a52 100644 --- a/crates/ggml/src/format/saver.rs +++ b/crates/ggml/src/format/ggml/saver.rs @@ -9,7 +9,9 @@ use std::{ io::{Seek, Write}, }; -use crate::{util, ContainerType, ElementType}; +use crate::{util, ElementType}; + +use super::ContainerType; #[derive(Debug, thiserror::Error)] /// Errors that can occur while writing a model. 
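For reference while reading this move: the size helpers relocated here keep the same arithmetic, only their module changes. A minimal sketch of the relationship they encode, assuming `type_size` and `blck_size` keep their crate-root behaviour (e.g. F32 has 4-byte elements and block size 1, so 1,000 F32 elements occupy 4,000 bytes of data plus the fixed per-tensor header):

// Bytes occupied by a tensor's data; the block size matters for quantized types.
fn data_size(element_type: ElementType, n_elements: usize) -> usize {
    (crate::type_size(element_type) * n_elements) / crate::blck_size(element_type)
}

// Fixed per-tensor overhead, unchanged by this diff.
fn header_size() -> usize {
    crate::Tensor::C_TYPE_SIZE + crate::OBJECT_SIZE
}

// Total footprint of a tensor: header plus data.
fn tensor_size(element_type: ElementType, n_elements: usize) -> usize {
    header_size() + data_size(element_type, n_elements)
}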
diff --git a/crates/ggml/src/tests.rs b/crates/ggml/src/format/ggml/tests.rs similarity index 83% rename from crates/ggml/src/tests.rs rename to crates/ggml/src/format/ggml/tests.rs index b842f45d..0c535975 100644 --- a/crates/ggml/src/tests.rs +++ b/crates/ggml/src/format/ggml/tests.rs @@ -4,9 +4,12 @@ use std::{ io::{BufRead, Write}, }; -use crate::*; use rand::{distributions::Uniform, prelude::*}; +use crate::format::data_size; + +use super::*; + #[derive(Debug)] struct DummyError; impl std::fmt::Display for DummyError { @@ -25,7 +28,7 @@ fn can_roundtrip_loader_and_saver_ggml() { ("efficient".as_bytes().to_vec(), 0.0), ]; - roundtrip_test(format::SaveContainerType::Ggml, tokenizer).unwrap(); + roundtrip_test(SaveContainerType::Ggml, tokenizer).unwrap(); } #[test] @@ -38,10 +41,10 @@ fn will_fail_on_scored_ggml_save() { ]; assert_eq!( - roundtrip_test(format::SaveContainerType::Ggml, tokenizer) + roundtrip_test(SaveContainerType::Ggml, tokenizer) .unwrap_err() .to_string(), - format::SaveError::::VocabularyScoringNotSupported.to_string() + SaveError::::VocabularyScoringNotSupported.to_string() ); } @@ -54,11 +57,11 @@ fn can_roundtrip_loader_and_saver_ggjt_v3() { ("efficient".as_bytes().to_vec(), 0.4), ]; - roundtrip_test(format::SaveContainerType::GgjtV3, tokenizer).unwrap(); + roundtrip_test(SaveContainerType::GgjtV3, tokenizer).unwrap(); } fn roundtrip_test( - save_container_type: format::SaveContainerType, + save_container_type: SaveContainerType, tokenizer: Vec<(Vec, f32)>, ) -> anyhow::Result<()> { let mut rng = rand::thread_rng(); @@ -79,13 +82,13 @@ fn roundtrip_test( .collect::>(); let n_elements = dims.iter().product::(); - let data = (0..format::data_size(element_type, n_elements)) + let data = (0..data_size(element_type, n_elements)) .map(|_| random()) .collect::>(); ( format!("tensor_{}", i), - format::TensorSaveInfo { + TensorSaveInfo { n_dims, dims: dims.try_into().unwrap(), element_type, @@ -100,7 +103,7 @@ fn roundtrip_test( let mut buffer = Vec::new(); let mut cursor = std::io::Cursor::new(&mut buffer); let mut save_handler = MockSaveHandler { model: &model }; - format::save( + save( &mut cursor, &mut save_handler, save_container_type, @@ -115,7 +118,7 @@ fn roundtrip_test( loaded_model: Model::default(), expected_container_type: save_container_type.into(), }; - format::load(&mut cursor, &mut load_handler)?; + load(&mut cursor, &mut load_handler)?; assert_eq!(load_handler.loaded_model, model); Ok(()) @@ -148,19 +151,19 @@ impl Hyperparameters { struct Model { hyperparameters: Hyperparameters, tokenizer: Vec<(Vec, f32)>, - tensors: BTreeMap, + tensors: BTreeMap, } struct MockSaveHandler<'a> { model: &'a Model, } -impl format::SaveHandler for MockSaveHandler<'_> { +impl SaveHandler for MockSaveHandler<'_> { fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), DummyError> { self.model.hyperparameters.write(writer).unwrap(); Ok(()) } - fn tensor_data(&mut self, tensor_name: &str) -> Result { + fn tensor_data(&mut self, tensor_name: &str) -> Result { self.model .tensors .get(tensor_name) @@ -174,7 +177,7 @@ struct MockLoadHandler<'a> { loaded_model: Model, expected_container_type: ContainerType, } -impl format::LoadHandler for MockLoadHandler<'_> { +impl LoadHandler for MockLoadHandler<'_> { fn container_type(&mut self, container_type: ContainerType) -> Result<(), DummyError> { assert_eq!(container_type, self.expected_container_type); Ok(()) @@ -189,9 +192,9 @@ impl format::LoadHandler for MockLoadHandler<'_> { fn read_hyperparameters( &mut self, 
reader: &mut dyn BufRead, - ) -> Result { + ) -> Result { self.loaded_model.hyperparameters = Hyperparameters::read(reader).unwrap(); - Ok(format::PartialHyperparameters { + Ok(PartialHyperparameters { n_vocab: self .loaded_model .hyperparameters @@ -201,8 +204,8 @@ impl format::LoadHandler for MockLoadHandler<'_> { }) } - fn tensor_buffer(&mut self, info: format::TensorLoadInfo) -> Result<(), DummyError> { - let data = format::TensorSaveInfo { + fn tensor_buffer(&mut self, info: TensorLoadInfo) -> Result<(), DummyError> { + let data = TensorSaveInfo { n_dims: info.n_dims, dims: info.dims, element_type: info.element_type, diff --git a/crates/ggml/src/format/gguf/metadata.rs b/crates/ggml/src/format/gguf/metadata.rs new file mode 100644 index 00000000..11da1916 --- /dev/null +++ b/crates/ggml/src/format/gguf/metadata.rs @@ -0,0 +1,614 @@ +use std::io::{self, BufRead, Write}; + +use indexmap::IndexMap; +use thiserror::Error; + +use crate::util; + +use super::{GgufContext, GgufLoadError}; + +#[derive(Debug, Clone, PartialEq)] +pub struct Metadata(pub IndexMap); +impl Metadata { + pub fn iter(&self) -> impl Iterator { + self.0.iter() + } + + pub fn keys(&self) -> impl Iterator { + self.0.keys() + } + + pub fn values(&self) -> impl Iterator { + self.0.values() + } + + pub fn get_optional(&self, key: &str) -> Option<&MetadataValue> { + self.0.get(key) + } + + pub fn contains_key(&self, key: &str) -> bool { + self.0.contains_key(key) + } + + pub fn get(&self, key: &str) -> Result<&MetadataValue, MetadataError> { + self.get_optional(key) + .ok_or_else(|| MetadataError::MissingKey { + key: key.to_owned(), + }) + } + + pub fn get_with_type<'a, T: ToMetadataValue>( + &'a self, + key: &'a str, + getter: impl Fn(&MetadataValue) -> Option, + ) -> Result { + let metadata_value = self.get(key)?; + getter(metadata_value).ok_or_else(|| MetadataError::InvalidType { + key: key.to_string(), + expected_type: T::value_type(), + actual_type: metadata_value.value_type(), + }) + } + + pub fn get_with_ref_type<'a, T: ToMetadataValue>( + &'a self, + key: &'a str, + getter: impl Fn(&MetadataValue) -> Option<&T>, + ) -> Result<&'a T, MetadataError> { + let metadata_value = self.get(key)?; + getter(metadata_value).ok_or_else(|| MetadataError::InvalidType { + key: key.to_string(), + expected_type: T::value_type(), + actual_type: metadata_value.value_type(), + }) + } + + pub fn get_array_with_type<'a, T: ToMetadataValue>( + &'a self, + key: &'a str, + getter: impl Fn(&MetadataValue) -> Option<&[T]>, + ) -> Result<&'a [T], MetadataError> { + let metadata_value = self.get(key)?; + getter(metadata_value).ok_or_else(|| MetadataError::InvalidType { + key: key.to_string(), + expected_type: T::value_type(), + actual_type: metadata_value.value_type(), + }) + } + + // TODO: consider finding a way to automate getting with traits + pub fn get_str(&self, key: &str) -> Result<&str, MetadataError> { + let metadata_value = self.get(key)?; + metadata_value + .as_string() + .ok_or_else(|| MetadataError::InvalidType { + key: key.to_string(), + expected_type: MetadataValueType::String, + actual_type: metadata_value.value_type(), + }) + } + + pub fn get_countable(&self, key: &str) -> Result { + let metadata_value = self.get(key)?; + match metadata_value { + MetadataValue::UInt32(v) => Ok(usize::try_from(*v)?), + MetadataValue::UInt64(v) => Ok(usize::try_from(*v)?), + _ => Err(MetadataError::InvalidType { + key: key.to_string(), + expected_type: MetadataValueType::UInt64, + actual_type: metadata_value.value_type(), + }), + } + } + + pub fn 
insert(&mut self, key: &str, value: T) { + self.0.insert(key.to_owned(), value.to_value()); + } +} + +#[repr(u32)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum MetadataValueType { + /// The value is a 8-bit unsigned integer. + UInt8 = 0, + /// The value is a 8-bit signed integer. + Int8 = 1, + /// The value is a 16-bit unsigned little-endian integer. + UInt16 = 2, + /// The value is a 16-bit signed little-endian integer. + Int16 = 3, + /// The value is a 32-bit unsigned little-endian integer. + UInt32 = 4, + /// The value is a 32-bit signed little-endian integer. + Int32 = 5, + /// The value is a 32-bit IEEE754 floating point number. + Float32 = 6, + /// The value is a boolean. + /// 1-byte value where 0 is false and 1 is true. + /// Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy. + Bool = 7, + /// The value is a UTF-8 non-null-terminated string, with length prepended. + String = 8, + /// The value is an array of other values, with the length and type prepended. + /// + /// Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes. + Array = 9, + /// The value is a 64-bit unsigned little-endian integer. + /// Implemented in GGUFv2. + UInt64 = 10, + /// The value is a 64-bit signed little-endian integer. + /// Implemented in GGUFv2. + Int64 = 11, + /// The value is a 64-bit IEEE754 floating point number. + /// Implemented in GGUFv2. + Float64 = 12, +} +pub trait ToMetadataValue { + fn value_type() -> MetadataValueType; + fn to_value(self) -> MetadataValue; +} +pub trait ToMetadataArrayValue { + fn to_array_value(self) -> MetadataArrayValue; +} +macro_rules! impl_value_boilerplate { + ($($value_type:ident($rust_type:ty)),*) => { + $( + impl ToMetadataValue for $rust_type { + fn value_type() -> MetadataValueType { + MetadataValueType::$value_type + } + + fn to_value(self) -> MetadataValue { + MetadataValue::$value_type(self) + } + } + + impl ToMetadataArrayValue for Vec<$rust_type> { + fn to_array_value(self) -> MetadataArrayValue { + MetadataArrayValue::$value_type(self) + } + } + )* + + impl TryFrom for MetadataValueType { + type Error = (); + + fn try_from(value: u32) -> Result { + for test_value in [ + $(MetadataValueType::$value_type),* + ] { + if value == test_value as u32 { + return Ok(test_value); + } + } + Err(()) + } + } + impl MetadataValueType { + fn read_value( + self, + ctx: &GgufContext, + reader: &mut dyn BufRead, + ) -> Result { + use MetadataValueType as MVT; + + Ok(match self { + $(MVT::$value_type => <$rust_type>::read(ctx, reader)?.to_value(),)* + }) + } + } + + #[derive(Debug, Clone, PartialEq)] + pub enum MetadataValue { + $( + $value_type($rust_type), + )* + } + impl MetadataValue { + pub fn value_type(&self) -> MetadataValueType { + match self { + $(MetadataValue::$value_type(_) => MetadataValueType::$value_type),* + } + } + + fn write(&self, ctx: &GgufContext, writer: &mut dyn Write) -> io::Result<()> { + match self { + $(MetadataValue::$value_type(v) => v.write(ctx, writer)),* + } + } + } + + #[derive(Debug, Clone, PartialEq)] + pub enum MetadataArrayValue { + $($value_type(Vec<$rust_type>),)* + } + impl MetadataArrayValue { + /// Returns the length of the array. + pub fn len(&self) -> usize { + match self { + $(Self::$value_type(v) => v.len(),)* + } + } + } + }; +} +impl_value_boilerplate! 
{ + UInt8(u8), + Int8(i8), + UInt16(u16), + Int16(i16), + UInt32(u32), + Int32(i32), + Float32(f32), + Bool(bool), + String(String), + Array(MetadataArrayValue), + UInt64(u64), + Int64(i64), + Float64(f64) +} + +// Public +impl MetadataValue { + pub fn as_uint8(&self) -> Option { + match self { + Self::UInt8(v) => Some(*v), + _ => None, + } + } + + pub fn as_int8(&self) -> Option { + match self { + Self::Int8(v) => Some(*v), + _ => None, + } + } + + pub fn as_uint16(&self) -> Option { + match self { + Self::UInt16(v) => Some(*v), + _ => None, + } + } + + pub fn as_int16(&self) -> Option { + match self { + Self::Int16(v) => Some(*v), + _ => None, + } + } + + pub fn as_uint32(&self) -> Option { + match self { + Self::UInt32(v) => Some(*v), + _ => None, + } + } + + pub fn as_int32(&self) -> Option { + match self { + Self::Int32(v) => Some(*v), + _ => None, + } + } + + pub fn as_float32(&self) -> Option { + match self { + Self::Float32(v) => Some(*v), + _ => None, + } + } + + pub fn as_bool(&self) -> Option { + match self { + Self::Bool(v) => Some(*v), + _ => None, + } + } + + pub fn as_string(&self) -> Option<&str> { + match self { + Self::String(v) => Some(v), + _ => None, + } + } + + pub fn as_array(&self) -> Option<&MetadataArrayValue> { + match self { + Self::Array(v) => Some(v), + _ => None, + } + } + + pub fn as_uint64(&self) -> Option { + match self { + Self::UInt64(v) => Some(*v), + _ => None, + } + } + + pub fn as_int64(&self) -> Option { + match self { + Self::Int64(v) => Some(*v), + _ => None, + } + } + + pub fn as_float64(&self) -> Option { + match self { + Self::Float64(v) => Some(*v), + _ => None, + } + } +} +impl MetadataValue { + pub(super) fn read_key_value( + ctx: &GgufContext, + reader: &mut dyn BufRead, + ) -> Result<(String, Self), GgufLoadError> { + let key = util::read_string(reader, ctx.use_64_bit_length)?; + let value_type = MetadataValueType::try_from(util::read_u32(reader)?) 
+ .expect("TODO: handle invalid value types"); + let value = value_type.read_value(ctx, reader)?; + + Ok((key, value)) + } + + pub(super) fn write_key_value( + &self, + ctx: &GgufContext, + writer: &mut dyn Write, + key: &str, + ) -> io::Result<()> { + util::write_string(writer, ctx.use_64_bit_length, key)?; + util::write_u32(writer, self.value_type() as u32)?; + self.write(ctx, writer)?; + + Ok(()) + } +} + +// Public +impl MetadataArrayValue { + pub fn as_uint8_array(&self) -> Option<&[u8]> { + match self { + Self::UInt8(v) => Some(v), + _ => None, + } + } + + pub fn as_int8_array(&self) -> Option<&[i8]> { + match self { + Self::Int8(v) => Some(v), + _ => None, + } + } + + pub fn as_uint16_array(&self) -> Option<&[u16]> { + match self { + Self::UInt16(v) => Some(v), + _ => None, + } + } + + pub fn as_int16_array(&self) -> Option<&[i16]> { + match self { + Self::Int16(v) => Some(v), + _ => None, + } + } + + pub fn as_uint32_array(&self) -> Option<&[u32]> { + match self { + Self::UInt32(v) => Some(v), + _ => None, + } + } + + pub fn as_int32_array(&self) -> Option<&[i32]> { + match self { + Self::Int32(v) => Some(v), + _ => None, + } + } + + pub fn as_float32_array(&self) -> Option<&[f32]> { + match self { + Self::Float32(v) => Some(v), + _ => None, + } + } + + pub fn as_bool_array(&self) -> Option<&[bool]> { + match self { + Self::Bool(v) => Some(v), + _ => None, + } + } + + pub fn as_string_array(&self) -> Option<&[String]> { + match self { + Self::String(v) => Some(v), + _ => None, + } + } + + pub fn as_array_array(&self) -> Option<&[MetadataArrayValue]> { + match self { + Self::Array(v) => Some(v), + _ => None, + } + } + + pub fn as_uint64_array(&self) -> Option<&[u64]> { + match self { + Self::UInt64(v) => Some(v), + _ => None, + } + } + + pub fn as_int64_array(&self) -> Option<&[i64]> { + match self { + Self::Int64(v) => Some(v), + _ => None, + } + } + + pub fn as_float64_array(&self) -> Option<&[f64]> { + match self { + Self::Float64(v) => Some(v), + _ => None, + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +// Shared +trait ValueIO { + fn read(ctx: &GgufContext, reader: &mut dyn BufRead) -> Result + where + Self: Sized; + fn write(&self, ctx: &GgufContext, writer: &mut dyn Write) -> io::Result<()>; +} +macro_rules! impl_value_io_boilerplate { + ($($value_type:ident($rust_type:ty, $read_method:ident, $write_method:ident)),*) => { + $( + impl ValueIO for $rust_type { + fn read(_ctx: &GgufContext, reader: &mut dyn BufRead) -> Result + where + Self: Sized, + { + Ok(util::$read_method(reader)?) + } + + fn write(&self, _ctx: &GgufContext, writer: &mut dyn Write) -> io::Result<()> { + util::$write_method(writer, *self) + } + } + )* + }; +} +impl_value_io_boilerplate! { + UInt8(u8, read_u8, write_u8), + Int8(i8, read_i8, write_i8), + UInt16(u16, read_u16, write_u16), + Int16(i16, read_i16, write_i16), + UInt32(u32, read_u32, write_u32), + Int32(i32, read_i32, write_i32), + Float32(f32, read_f32, write_f32), + Bool(bool, read_bool, write_bool), + UInt64(u64, read_u64, write_u64), + Int64(i64, read_i64, write_i64), + Float64(f64, read_f64, write_f64) +} +impl ValueIO for String { + fn read(ctx: &GgufContext, reader: &mut dyn BufRead) -> Result + where + Self: Sized, + { + Ok(util::read_string(reader, ctx.use_64_bit_length)?) 
+ } + + fn write(&self, ctx: &GgufContext, writer: &mut dyn Write) -> io::Result<()> { + util::write_string(writer, ctx.use_64_bit_length, self) + } +} +impl ValueIO for MetadataArrayValue { + fn read(ctx: &GgufContext, reader: &mut dyn BufRead) -> Result + where + Self: Sized, + { + let value_type = MetadataValueType::try_from(util::read_u32(reader)?) + .expect("TODO: handle invalid value types"); + let length = util::read_length(reader, ctx.use_64_bit_length)?; + + use MetadataValueType as MVT; + return match value_type { + MVT::UInt8 => read_array::(ctx, reader, length), + MVT::Int8 => read_array::(ctx, reader, length), + MVT::UInt16 => read_array::(ctx, reader, length), + MVT::Int16 => read_array::(ctx, reader, length), + MVT::UInt32 => read_array::(ctx, reader, length), + MVT::Int32 => read_array::(ctx, reader, length), + MVT::Float32 => read_array::(ctx, reader, length), + MVT::Bool => read_array::(ctx, reader, length), + MVT::String => read_array::(ctx, reader, length), + MVT::Array => read_array::(ctx, reader, length), + MVT::UInt64 => read_array::(ctx, reader, length), + MVT::Int64 => read_array::(ctx, reader, length), + MVT::Float64 => read_array::(ctx, reader, length), + }; + + fn read_array( + ctx: &GgufContext, + reader: &mut dyn BufRead, + length: usize, + ) -> Result + where + Vec: ToMetadataArrayValue, + { + (0..length) + .map(|_| T::read(ctx, reader)) + .collect::, _>>() + .map(|v| v.to_array_value()) + } + } + + fn write(&self, ctx: &GgufContext, writer: &mut dyn Write) -> io::Result<()> { + return match self { + MetadataArrayValue::UInt8(v) => write_array(ctx, writer, v), + MetadataArrayValue::Int8(v) => write_array(ctx, writer, v), + MetadataArrayValue::UInt16(v) => write_array(ctx, writer, v), + MetadataArrayValue::Int16(v) => write_array(ctx, writer, v), + MetadataArrayValue::UInt32(v) => write_array(ctx, writer, v), + MetadataArrayValue::Int32(v) => write_array(ctx, writer, v), + MetadataArrayValue::Float32(v) => write_array(ctx, writer, v), + MetadataArrayValue::Bool(v) => write_array(ctx, writer, v), + MetadataArrayValue::String(v) => write_array(ctx, writer, v), + MetadataArrayValue::Array(v) => write_array(ctx, writer, v), + MetadataArrayValue::UInt64(v) => write_array(ctx, writer, v), + MetadataArrayValue::Int64(v) => write_array(ctx, writer, v), + MetadataArrayValue::Float64(v) => write_array(ctx, writer, v), + }; + + fn write_array( + ctx: &GgufContext, + writer: &mut dyn Write, + array: &[T], + ) -> io::Result<()> { + util::write_u32(writer, T::value_type() as u32)?; + util::write_length(writer, ctx.use_64_bit_length, array.len())?; + for value in array { + value.write(ctx, writer)?; + } + Ok(()) + } + } +} + +#[derive(Error, Debug)] +/// Errors encountered during the loading process. +pub enum MetadataError { + /// The model expected a metadata key-value pair, but the key was missing. + #[error("missing metadata key {key:?}")] + MissingKey { + /// The key that was missing. + key: String, + }, + /// The metadata key-value pair was not of the expected type. + #[error("metadata key {key:?} was not of the expected type")] + InvalidType { + /// The key with the invalid type. + key: String, + /// The expected type. + expected_type: MetadataValueType, + /// The actual type. + actual_type: MetadataValueType, + }, + #[error("invalid integer conversion")] + /// One of the integers encountered could not be converted to a more appropriate type. 
+ InvalidIntegerConversion(#[from] std::num::TryFromIntError), +} diff --git a/crates/ggml/src/format/gguf/mod.rs b/crates/ggml/src/format/gguf/mod.rs new file mode 100644 index 00000000..dc2ab9dd --- /dev/null +++ b/crates/ggml/src/format/gguf/mod.rs @@ -0,0 +1,275 @@ +#![allow(missing_docs)] + +use std::io::{BufRead, BufWriter, Seek, Write}; + +use super::{data_size, header_size, ContainerType, ContainerTypeReadError}; +use crate::{util, ElementType}; + +use ggml_sys::ggml_type; +use indexmap::IndexMap; +use thiserror::Error; + +mod metadata; +pub use metadata::*; + +pub const DEFAULT_ALIGNMENT: u32 = 32; +pub const META_TENSOR_DATA_LAYOUT: &str = "Meta AI original pth"; + +#[derive(Debug, Error)] +/// Errors that can occur while loading a model. +pub enum GgufLoadError { + #[error("invalid GGUF file magic value: {0}")] + /// The file magic number is invalid. + InvalidMagic(util::FileMagic), + #[error("invalid ggml format: format={0:?}")] + /// An unsupported format version was found. + InvalidFormatVersion(ContainerType), + #[error("non-specific I/O error")] + /// A non-specific IO error. + Io(#[from] std::io::Error), + #[error("could not convert bytes to a UTF-8 string")] + /// One of the strings encountered was not valid UTF-8. + InvalidUtf8(#[from] std::string::FromUtf8Error), + #[error("invalid integer conversion")] + /// One of the integers encountered could not be converted to a more appropriate type. + InvalidIntegerConversion(#[from] std::num::TryFromIntError), + #[error("unsupported tensor type {ftype} for tensor {tensor_name}")] + /// One of the tensors encountered had an unsupported data type. + UnsupportedElementType { + /// The name of the tensor. + tensor_name: String, + /// The format type that was encountered. + ftype: u32, + }, +} + +#[derive(Debug, Error)] +/// Errors that can occur while saving a model. +pub enum GgufSaveError { + // TODO! 
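+    // (Note: no variants are defined here yet; `Gguf::save` below currently
+    // surfaces failures directly as `std::io::Result`, so this enum is only a
+    // placeholder for future save-specific errors.)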
+} + +pub type TensorInfos = IndexMap; + +#[derive(Debug, Clone, PartialEq)] +pub struct Gguf { + pub metadata: Metadata, + pub tensor_infos: TensorInfos, + pub tensor_data_position: u64, +} +impl Gguf { + pub fn load(reader: &mut R) -> Result { + let container = ContainerType::read(reader).map_err(|e| match e { + ContainerTypeReadError::InvalidMagic(magic) => GgufLoadError::InvalidMagic(magic), + ContainerTypeReadError::Io(io) => GgufLoadError::Io(io), + })?; + if ![ + ContainerType::Gguf(1), + ContainerType::Gguf(2), + ContainerType::Gguf(3), + ] + .contains(&container) + { + return Err(GgufLoadError::InvalidFormatVersion(container)); + } + + let ctx = GgufContext { + use_64_bit_length: container == ContainerType::Gguf(2) + || container == ContainerType::Gguf(3), + }; + + let tensor_count = util::read_length(reader, ctx.use_64_bit_length)?; + let metadata_kv_count = util::read_length(reader, ctx.use_64_bit_length)?; + + let mut metadata = IndexMap::with_capacity(metadata_kv_count); + for _ in 0..metadata_kv_count { + let (key, value) = MetadataValue::read_key_value(&ctx, reader)?; + metadata.insert(key, value); + } + let metadata = Metadata(metadata); + + let alignment = metadata + .get_optional("general.alignment") + .and_then(|v| v.as_uint32()) + .unwrap_or(DEFAULT_ALIGNMENT) as u64; + + let mut tensor_infos = IndexMap::with_capacity(tensor_count); + for _ in 0..tensor_count { + let (key, value) = TensorInfo::read_name_value(&ctx, reader)?; + tensor_infos.insert(key, value); + } + + let tensor_data_position = align_offset(reader.stream_position()?, alignment); + + Ok(Gguf { + metadata, + tensor_infos, + tensor_data_position, + }) + } + + /// Saves the GGUF file to the given writer. + /// + /// `get_tensor_size` is a function that returns the size of a tensor's data in bytes. + /// `write_tensor_data` is a function that writes the tensor's data to the writer; the data + /// must be the same length as the value returned by `get_tensor_size`. + /// + /// The `offset` in `TensorInfo` will be ignored and the correct offset will be calculated + /// automatically. + pub fn save( + &self, + writer: &mut BufWriter, + mut write_tensor_data: impl FnMut(&mut BufWriter, &str, &TensorInfo) -> std::io::Result<()>, + ) -> std::io::Result<()> { + // Write header + let container = ContainerType::Gguf(2); + container.write(writer)?; + + let ctx = GgufContext { + use_64_bit_length: true, + }; + + util::write_length(writer, ctx.use_64_bit_length, self.tensor_infos.len())?; + util::write_length(writer, ctx.use_64_bit_length, self.metadata.0.len())?; + + // Write metadata + for (key, value) in &self.metadata.0 { + value.write_key_value(&ctx, writer, key)?; + } + + // Write tensor infos + let alignment = self + .metadata + .get_optional("general.alignment") + .and_then(|v| v.as_uint32()) + .unwrap_or(DEFAULT_ALIGNMENT) as u64; + + // Pre-plan the write before writing the tensor data. 
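+        // Offsets assigned here are relative to the start of the tensor data
+        // region, and each one is rounded up with `align_offset`; for example,
+        // with the default alignment of 32, a 1000-byte tensor is followed by
+        // 24 bytes of padding so that the next tensor starts at offset 1024.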
+ #[derive(Debug)] + struct TensorWrite { + name: String, + info: TensorInfo, + size: usize, + } + let mut tensors = vec![]; + let mut next_offset = 0; + for (name, tensor_info) in &self.tensor_infos { + let size = tensor_info.calc_size(); + tensors.push(TensorWrite { + name: name.clone(), + info: TensorInfo { + offset: next_offset, + ..tensor_info.clone() + }, + size, + }); + + next_offset = align_offset(next_offset + size as u64, alignment); + } + + for write in &tensors { + write.info.write_name_value(&ctx, writer, &write.name)?; + } + + // Write tensors + let stream_position = writer.stream_position()?; + let tensor_data_position = align_offset(stream_position, alignment); + assert!(tensor_data_position > stream_position); + util::write_zero_bytes(writer, (tensor_data_position - stream_position) as usize)?; + + for write in &tensors { + write_tensor_data(writer, &write.name, &write.info)?; + + let stream_position = writer.stream_position()?; + assert!( + stream_position == tensor_data_position + write.info.offset + write.size as u64 + ); + let next_position = align_offset(stream_position, alignment); + util::write_zero_bytes(writer, (next_position - stream_position) as usize)?; + } + + Ok(()) + } +} + +fn align_offset(offset: u64, alignment: u64) -> u64 { + offset + (alignment - (offset % alignment)) % alignment +} + +struct GgufContext { + use_64_bit_length: bool, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct TensorInfo { + pub dimensions: Vec, + pub element_type: ElementType, + /// This offset is relative to `tensor_data`, not to the start + /// of the file, to make it easier for writers to write the file. + pub offset: u64, +} +impl TensorInfo { + fn read_name_value( + ctx: &GgufContext, + reader: &mut dyn BufRead, + ) -> Result<(String, Self), GgufLoadError> { + let name = util::read_string(reader, ctx.use_64_bit_length)?; + + let dimension_count = util::read_u32(reader)? as usize; + let dimensions = (0..dimension_count) + .map(|_| util::read_length(reader, ctx.use_64_bit_length)) + .collect::, _>>()?; + + let element_type = util::read_u32(reader)?; + let element_type = ElementType::try_from(element_type).map_err(|_| { + GgufLoadError::UnsupportedElementType { + tensor_name: name.clone(), + ftype: element_type, + } + })?; + + let offset = util::read_u64(reader)?; + + Ok(( + name, + Self { + dimensions, + element_type, + offset, + }, + )) + } + + fn write_name_value( + &self, + ctx: &GgufContext, + writer: &mut dyn Write, + name: &str, + ) -> std::io::Result<()> { + util::write_string(writer, ctx.use_64_bit_length, name)?; + + util::write_u32(writer, self.dimensions.len().try_into().unwrap())?; + for dimension in &self.dimensions { + util::write_length(writer, ctx.use_64_bit_length, *dimension)?; + } + + util::write_u32(writer, ggml_type::from(self.element_type))?; + util::write_u64(writer, self.offset)?; + + Ok(()) + } + + /// Calculate the size of the tensor's values in bytes. + pub fn calc_size(&self) -> usize { + data_size(self.element_type, self.dimensions.iter().product()) + } + + /// Calculates the absolute size in bytes of the tensor's data, given the mmap flag. + pub fn calc_absolute_size(&self, mmap: bool) -> usize { + if mmap { + header_size() + } else { + header_size() + self.calc_size() + } + } +} diff --git a/crates/ggml/src/format/mod.rs b/crates/ggml/src/format/mod.rs index f1a939b7..7cc1af73 100644 --- a/crates/ggml/src/format/mod.rs +++ b/crates/ggml/src/format/mod.rs @@ -1,7 +1,127 @@ -//! 
Loading and saving of [GGML](https://github.com/ggerganov/ggml) files. +//! Loading and saving of GGML-related files. -mod loader; -mod saver; +use thiserror::Error; -pub use loader::*; -pub use saver::*; +use crate::{util, ElementType}; + +#[cfg(feature = "pre-gguf-formats")] +pub mod ggml; +pub mod gguf; + +/// Magic constant for `ggml` files (unversioned). +pub const FILE_MAGIC_GGML: [u8; 4] = *b"lmgg"; +/// Magic constant for `ggml` files (versioned, ggmf). +pub const FILE_MAGIC_GGMF: [u8; 4] = *b"fmgg"; +/// Magic constant for `ggml` files (versioned, ggjt). +pub const FILE_MAGIC_GGJT: [u8; 4] = *b"tjgg"; +/// Magic constant for `ggla` files (LoRA adapter). +pub const FILE_MAGIC_GGLA: [u8; 4] = *b"algg"; +/// Magic constant for `gguf` files. +pub const FILE_MAGIC_GGUF: [u8; 4] = *b"GGUF"; + +/// Errors that can occur while reading the container type. +#[derive(Debug, Error)] +pub enum ContainerTypeReadError { + /// The magic value was invalid. + #[error("invalid magic value: {0}")] + InvalidMagic(util::FileMagic), + /// An I/O error occurred. + #[error("I/O error")] + Io(#[from] std::io::Error), +} + +#[derive(Debug, PartialEq, Clone, Copy)] +/// The format of the file containing the model. +pub enum ContainerType { + /// Legacy format, oldest ggml tensor file format + Ggml, + /// Legacy format. Introduces versioning. Newer than GGML, older than GGJT. + Ggmf(u32), + /// [mmap](https://en.wikipedia.org/wiki/Mmap)-able format. + Ggjt(u32), + /// LoRA adapter format. + Ggla(u32), + /// GGUF format. Current version of the format. + Gguf(u32), +} +impl ContainerType { + /// Does this container type support mmap? + pub fn support_mmap(&self) -> bool { + match self { + ContainerType::Ggml => false, + ContainerType::Ggmf(_) => false, + ContainerType::Ggla(_) => false, + ContainerType::Ggjt(_) => true, + ContainerType::Gguf(_) => true, + } + } + + /// Read the container type from a reader. + pub fn read(reader: &mut dyn std::io::BufRead) -> Result { + // Verify magic + let magic = util::read_bytes::<4>(reader)?; + let container_type: ContainerType = match magic { + FILE_MAGIC_GGML => ContainerType::Ggml, + FILE_MAGIC_GGMF => { + let version = util::read_u32(reader)?; + ContainerType::Ggmf(version) + } + FILE_MAGIC_GGJT => { + let version = util::read_u32(reader)?; + ContainerType::Ggjt(version) + } + FILE_MAGIC_GGLA => { + let version = util::read_u32(reader)?; + ContainerType::Ggla(version) + } + FILE_MAGIC_GGUF => { + let version = util::read_u32(reader)?; + ContainerType::Gguf(version) + } + magic => return Err(ContainerTypeReadError::InvalidMagic(util::FileMagic(magic))), + }; + + Ok(container_type) + } + + /// Write the container type to a writer. + pub fn write(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { + match self { + ContainerType::Ggml => { + writer.write_all(&FILE_MAGIC_GGML)?; + } + ContainerType::Ggmf(version) => { + writer.write_all(&FILE_MAGIC_GGMF)?; + util::write_u32(writer, *version)?; + } + ContainerType::Ggjt(version) => { + writer.write_all(&FILE_MAGIC_GGJT)?; + util::write_u32(writer, *version)?; + } + ContainerType::Ggla(version) => { + writer.write_all(&FILE_MAGIC_GGLA)?; + util::write_u32(writer, *version)?; + } + ContainerType::Gguf(version) => { + writer.write_all(&FILE_MAGIC_GGUF)?; + util::write_u32(writer, *version)?; + } + } + Ok(()) + } +} + +/// Returns the size occupied by a tensor's data in bytes given the element type and number of elements. 
+pub(crate) fn data_size(element_type: ElementType, n_elements: usize) -> usize { + (crate::type_size(element_type) * n_elements) / crate::blck_size(element_type) +} + +/// Returns the size of the ggml tensor header in bytes. +pub(crate) fn header_size() -> usize { + crate::Tensor::C_TYPE_SIZE + crate::OBJECT_SIZE +} + +/// Returns the size of a tensor in bytes given the element type and number of elements. This includes the tensor's header. +pub fn tensor_size(element_type: ElementType, n_elements: usize) -> usize { + header_size() + data_size(element_type, n_elements) +} diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 26bcc548..fc99a0c0 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -28,97 +28,9 @@ pub use tensor::Tensor; pub use ggml_sys as sys; -#[cfg(test)] -mod tests; - /// The type of a tensor element. pub type ElementType = Type; -#[derive(Debug, PartialEq, Clone, Copy)] -/// The format of the file containing the model. -pub enum ContainerType { - /// Legacy format, oldest ggml tensor file format - Ggml, - /// Legacy format. Introduces versioning. Newer than GGML, older than GGJT. - Ggmf(u32), - /// [mmap](https://en.wikipedia.org/wiki/Mmap)-able format. Current version of the format. - Ggjt(u32), - /// LoRA adapter format. - Ggla(u32), -} -impl ContainerType { - /// Does this container type support mmap? - pub fn support_mmap(&self) -> bool { - match self { - ContainerType::Ggml => false, - ContainerType::Ggmf(_) => false, - ContainerType::Ggla(_) => false, - ContainerType::Ggjt(_) => true, - } - } - - /// Read the container type from a reader. - pub fn read( - reader: &mut dyn std::io::BufRead, - ) -> Result> { - // Verify magic - let magic = util::read_u32(reader)?; - let container_type: ContainerType = match magic { - crate::FILE_MAGIC_GGML => ContainerType::Ggml, - crate::FILE_MAGIC_GGMF => { - let version = util::read_u32(reader)?; - ContainerType::Ggmf(version) - } - crate::FILE_MAGIC_GGJT => { - let version = util::read_u32(reader)?; - ContainerType::Ggjt(version) - } - crate::FILE_MAGIC_GGLA => { - let version = util::read_u32(reader)?; - ContainerType::Ggla(version) - } - magic => { - return Err(crate::format::LoadError::InvalidMagic(format::FormatMagic( - magic, - ))) - } - }; - - Ok(container_type) - } - - /// Write the container type to a writer. - pub fn write(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { - match self { - ContainerType::Ggml => { - util::write_u32(writer, FILE_MAGIC_GGML)?; - } - ContainerType::Ggmf(version) => { - util::write_u32(writer, FILE_MAGIC_GGMF)?; - util::write_u32(writer, *version)?; - } - ContainerType::Ggjt(version) => { - util::write_u32(writer, FILE_MAGIC_GGJT)?; - util::write_u32(writer, *version)?; - } - ContainerType::Ggla(version) => { - util::write_u32(writer, FILE_MAGIC_GGLA)?; - util::write_u32(writer, *version)?; - } - } - Ok(()) - } -} - -/// Magic constant for `ggml` files (unversioned). -pub const FILE_MAGIC_GGML: u32 = 0x67676d6c; -/// Magic constant for `ggml` files (versioned, ggmf). -pub const FILE_MAGIC_GGMF: u32 = 0x67676d66; -/// Magic constant for `ggml` files (versioned, ggjt). -pub const FILE_MAGIC_GGJT: u32 = 0x67676a74; -/// Magic constant for `ggla` files (LoRA adapter). -pub const FILE_MAGIC_GGLA: u32 = 0x67676C61; - /// The current quantization version. pub const QNT_VERSION: u32 = sys::GGML_QNT_VERSION; /// The factor by which to divide `ftype` to determine the current quantization version. 
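// Illustrative sketch (not part of the patch): a round-trip through the
// relocated `ContainerType` read/write helpers above. It assumes the
// `ggml::format::ContainerType` path stays publicly reachable (as the
// loader's re-export suggests) and uses `std::io::Cursor` purely for the
// example; the GGUF magic is the four bytes "GGUF" followed by a
// little-endian u32 version.
use std::io::Cursor;

use ggml::format::ContainerType;

fn gguf_magic_roundtrip() -> std::io::Result<()> {
    let mut buf = Vec::new();
    ContainerType::Gguf(3).write(&mut buf)?;
    // The first four bytes are the GGUF magic; the version follows.
    assert_eq!(buf[..4], *b"GGUF");

    let read_back = ContainerType::read(&mut Cursor::new(buf))
        .expect("magic and version were just written");
    assert_eq!(read_back, ContainerType::Gguf(3));
    assert!(read_back.support_mmap());
    Ok(())
}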
@@ -159,7 +71,7 @@ impl Default for RoPEOverrides { } } -#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, PartialOrd, Ord)] /// The type of a value in `ggml`. pub enum Type { /// Quantized 4-bit (type 0). diff --git a/crates/ggml/src/util.rs b/crates/ggml/src/util.rs index 69b20de4..bdddbeca 100644 --- a/crates/ggml/src/util.rs +++ b/crates/ggml/src/util.rs @@ -1,75 +1,222 @@ //! Utilities for reading and writing. -use std::io::{BufRead, Write}; +use std::{ + fmt, + io::{self, BufRead, Write}, +}; + +/// Helper struct that wraps the magic number of a file format, +/// so that it can be printed in a human-readable format. +pub struct FileMagic(pub [u8; 4]); +impl fmt::Display for FileMagic { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:x?} ({})", self.0, String::from_utf8_lossy(&self.0)) + } +} +impl fmt::Debug for FileMagic { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// +/// READERS +/// /// Read a fixed-size array of bytes from a reader. -pub fn read_bytes(reader: &mut dyn BufRead) -> Result<[u8; N], std::io::Error> { +pub fn read_bytes(reader: &mut dyn BufRead) -> io::Result<[u8; N]> { let mut bytes = [0u8; N]; reader.read_exact(&mut bytes)?; Ok(bytes) } +/// Read a `i8` from a reader. +pub fn read_i8(reader: &mut dyn BufRead) -> io::Result { + Ok(i8::from_le_bytes(read_bytes::<1>(reader)?)) +} + +/// Read a `u8` from a reader. +pub fn read_u8(reader: &mut dyn BufRead) -> io::Result { + Ok(u8::from_le_bytes(read_bytes::<1>(reader)?)) +} + +/// Read a `i16` from a reader. +pub fn read_i16(reader: &mut dyn BufRead) -> io::Result { + Ok(i16::from_le_bytes(read_bytes::<2>(reader)?)) +} + +/// Read a `u16` from a reader. +pub fn read_u16(reader: &mut dyn BufRead) -> io::Result { + Ok(u16::from_le_bytes(read_bytes::<2>(reader)?)) +} + /// Read a `i32` from a reader. -pub fn read_i32(reader: &mut dyn BufRead) -> Result { +pub fn read_i32(reader: &mut dyn BufRead) -> io::Result { Ok(i32::from_le_bytes(read_bytes::<4>(reader)?)) } /// Read a `u32` from a reader. -pub fn read_u32(reader: &mut dyn BufRead) -> Result { +pub fn read_u32(reader: &mut dyn BufRead) -> io::Result { Ok(u32::from_le_bytes(read_bytes::<4>(reader)?)) } +/// Read a `i64` from a reader. +pub fn read_i64(reader: &mut dyn BufRead) -> io::Result { + Ok(i64::from_le_bytes(read_bytes::<8>(reader)?)) +} + +/// Read a `u64` from a reader. +pub fn read_u64(reader: &mut dyn BufRead) -> io::Result { + Ok(u64::from_le_bytes(read_bytes::<8>(reader)?)) +} + /// Read a `f32` from a reader. -pub fn read_f32(reader: &mut dyn BufRead) -> Result { +pub fn read_f32(reader: &mut dyn BufRead) -> io::Result { Ok(f32::from_le_bytes(read_bytes::<4>(reader)?)) } -/// Read a `bool` represented as an `i32` from a reader. -pub fn read_bool(reader: &mut dyn BufRead) -> Result { - let val = i32::from_le_bytes(read_bytes::<4>(reader)?); +/// Read a `f64` from a reader. +pub fn read_f64(reader: &mut dyn BufRead) -> io::Result { + Ok(f64::from_le_bytes(read_bytes::<8>(reader)?)) +} + +/// Read an integer (32-bit or 64-bit) from a reader, and convert it to a usize. +pub fn read_length(reader: &mut dyn BufRead, use_64_bit_length: bool) -> io::Result { + let len: usize = if use_64_bit_length { + read_u64(reader)?.try_into() + } else { + read_u32(reader)?.try_into() + } + .expect("TODO: invalid usize conversion"); + Ok(len) +} + +/// Read a `bool` represented as a single byte from a reader. 
+pub fn read_bool(reader: &mut dyn BufRead) -> io::Result { + let val = read_bytes::<1>(reader)?[0]; + match val { 0 => Ok(false), 1 => Ok(true), _ => Err(std::io::Error::new( std::io::ErrorKind::InvalidData, - format!("Invalid i32 value for bool: '{}'", val), + format!("Invalid value for bool: '{}'", val), )), } } /// Read a variable-length array of bytes from a reader. -pub fn read_bytes_with_len( - reader: &mut dyn BufRead, - len: usize, -) -> Result, std::io::Error> { +pub fn read_bytes_with_len(reader: &mut dyn BufRead, len: usize) -> io::Result> { let mut bytes = vec![0u8; len]; reader.read_exact(&mut bytes)?; Ok(bytes) } +/// Read a string from a reader. +pub fn read_string(reader: &mut dyn BufRead, use_64_bit_length: bool) -> io::Result { + let len = read_length(reader, use_64_bit_length)?; + let mut bytes = read_bytes_with_len(reader, len)?; + // The GGUF C writer prior to `llama.cpp@103cfafc774f6feb3172b5d4d39681c965b17eba` + // wrote a null terminator at the end of strings. As a work-around, we remove + // them here. + if bytes.last() == Some(&0) { + // Remove the null terminator. + bytes.pop(); + } + Ok(String::from_utf8(bytes) + .expect("string was not valid utf-8 (TODO: make this a library error)")) +} + +/// +/// WRITERS +/// + +/// Write a `i8` from a writer. +pub fn write_i8(writer: &mut dyn Write, value: i8) -> io::Result<()> { + writer.write_all(&value.to_le_bytes()) +} + +/// Write a `u8` from a writer. +pub fn write_u8(writer: &mut dyn Write, value: u8) -> io::Result<()> { + writer.write_all(&value.to_le_bytes()) +} + +/// Write a `i16` from a writer. +pub fn write_i16(writer: &mut dyn Write, value: i16) -> io::Result<()> { + writer.write_all(&value.to_le_bytes()) +} + +/// Write a `u16` from a writer. +pub fn write_u16(writer: &mut dyn Write, value: u16) -> io::Result<()> { + writer.write_all(&value.to_le_bytes()) +} + /// Write a `i32` from a writer. -pub fn write_i32(writer: &mut dyn Write, value: i32) -> Result<(), std::io::Error> { +pub fn write_i32(writer: &mut dyn Write, value: i32) -> io::Result<()> { writer.write_all(&value.to_le_bytes()) } /// Write a `u32` from a writer. -pub fn write_u32(writer: &mut dyn Write, value: u32) -> Result<(), std::io::Error> { +pub fn write_u32(writer: &mut dyn Write, value: u32) -> io::Result<()> { + writer.write_all(&value.to_le_bytes()) +} + +/// Write a `i64` from a writer. +pub fn write_i64(writer: &mut dyn Write, value: i64) -> io::Result<()> { + writer.write_all(&value.to_le_bytes()) +} + +/// Write a `u64` from a writer. +pub fn write_u64(writer: &mut dyn Write, value: u64) -> io::Result<()> { writer.write_all(&value.to_le_bytes()) } /// Write a `f32` from a writer. -pub fn write_f32(writer: &mut dyn Write, value: f32) -> Result<(), std::io::Error> { +pub fn write_f32(writer: &mut dyn Write, value: f32) -> io::Result<()> { + writer.write_all(&value.to_le_bytes()) +} + +/// Write a `f64` from a writer. +pub fn write_f64(writer: &mut dyn Write, value: f64) -> io::Result<()> { writer.write_all(&value.to_le_bytes()) } /// Write a `bool` represented as an `i32` to a writer. -pub fn write_bool(writer: &mut dyn Write, value: bool) -> Result<(), std::io::Error> { +pub fn write_bool(writer: &mut dyn Write, value: bool) -> io::Result<()> { let int_value: i32 = if value { 1 } else { 0 }; writer.write_all(&int_value.to_le_bytes()) } +/// Write an integer (32-bit or 64-bit) to a writer, and convert it from a usize. 
+pub fn write_length(writer: &mut dyn Write, use_64_bit_length: bool, len: usize) -> io::Result<()> { + if use_64_bit_length { + write_u64(writer, len as u64) + } else { + write_u32(writer, len as u32) + } +} + +/// Read a string from a reader. +pub fn write_string( + writer: &mut dyn Write, + use_64_bit_length: bool, + value: &str, +) -> io::Result<()> { + write_length(writer, use_64_bit_length, value.len())?; + writer.write_all(value.as_bytes()) +} + +/// Write N zero bytes to a writer. +// TODO: is there a more efficient way to do this? +pub fn write_zero_bytes(writer: &mut dyn Write, n: usize) -> io::Result<()> { + for _ in 0..n { + writer.write_all(&[0u8])?; + } + Ok(()) +} + // NOTE: Implementation from #![feature(buf_read_has_data_left)] /// Check if there is any data left in the reader. -pub fn has_data_left(reader: &mut impl BufRead) -> Result { +pub fn has_data_left(reader: &mut impl BufRead) -> io::Result { reader.fill_buf().map(|b| !b.is_empty()) } diff --git a/crates/llm-base/Cargo.toml b/crates/llm-base/Cargo.toml index badcbdc6..0474d1d6 100644 --- a/crates/llm-base/Cargo.toml +++ b/crates/llm-base/Cargo.toml @@ -17,16 +17,19 @@ bytemuck = { workspace = true } rand = { workspace = true } serde = { workspace = true } thiserror = { workspace = true } +indexmap = { workspace = true } +memmap2 = { workspace = true } +tracing = { workspace = true } +llm-samplers = { workspace = true } partial_sort = "0.2.0" serde_bytes = "0.11" -memmap2 = { workspace = true } half = "2" -tokenizers = {version="0.13.4", default-features=false, features=["onig"]} +tokenizers = { version = "0.13.4", default-features = false, features = [ + "onig", +] } regex = "1.8" -tracing = { workspace = true } -llm-samplers = { workspace = true } [features] tokenizers-remote = ["tokenizers/http"] diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 0c9f3a5b..5340ec11 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -393,14 +393,11 @@ impl InferenceSession { for &tk in batch { let should_call_callback = Some(tk) != model.bot_token_id(); - let mut token = match model.tokenizer() { - crate::Tokenizer::Embedded(_) => model.tokenizer().token(tk as usize).to_vec(), - crate::Tokenizer::HuggingFace(_) => { - let mut tokens = self.tokens.clone(); - tokens.push(tk); + let mut token = { + let mut tokens = self.tokens.clone(); + tokens.push(tk); - get_newly_decoded_portion_huggingface(model, tokens, &self.decoded_tokens) - } + get_newly_decoded_portion(model, tokens, &self.decoded_tokens) }; if should_call_callback { @@ -559,16 +556,7 @@ impl InferenceSession { if next_token as TokenId == model.eot_token_id() { Err(InferenceError::EndOfText) } else { - let res = match model.tokenizer() { - crate::Tokenizer::Embedded(_) => { - model.tokenizer().token(next_token as usize).to_vec() - } - crate::Tokenizer::HuggingFace(_) => get_newly_decoded_portion_huggingface( - model, - self.tokens.clone(), - &self.decoded_tokens, - ), - }; + let res = get_newly_decoded_portion(model, self.tokens.clone(), &self.decoded_tokens); self.decoded_tokens.append(&mut res.clone()); Ok(res) @@ -894,7 +882,10 @@ impl Drop for InferenceSession { } } -fn get_newly_decoded_portion_huggingface( +// TODO: Cache results and/or find a more intelligent way to do this. +// At present, this will decode *all* tokens generated by the model, which is +// not ideal when it gets run on each new token. Perhaps only consider the last three for decoding? 
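+// (Since this runs once per generated token, the decoding work grows roughly
+// quadratically with the length of the generated sequence.)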
+fn get_newly_decoded_portion( model: &dyn Model, tokens: Vec, decoded_tokens: &[u8], @@ -924,7 +915,7 @@ pub enum InferenceError { /// /// Note that this error *can* be ignored and inference can continue, but the results are not guaranteed to be sensical. EndOfText, - #[error("the user-specified callback returned an error")] + #[error("the user-specified callback returned an error: {0}")] /// The user-specified callback returned an error. UserCallback(Box), /// Sampling returned an error. diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index f0a88a8a..ebf71e77 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -8,13 +8,13 @@ #![deny(missing_docs)] mod inference_session; -mod loader; mod lora; mod quantize; -mod tokenizer; +pub mod loader; pub mod model; pub mod samplers; +pub mod tokenizer; pub mod util; use std::sync::{Arc, Mutex}; @@ -29,13 +29,10 @@ pub use inference_session::{ ModelKVMemoryType, RewindError, SnapshotError, }; pub use llm_samplers::prelude::{Sampler, SamplerChain}; -pub use loader::{ - load, load_progress_callback_stdout, ContainerType, FileType, FileTypeFormat, FormatMagic, - LoadError, LoadProgress, Loader, TensorLoader, -}; +pub use loader::{ContainerType, FileMagic, FileType, FileTypeFormat}; pub use lora::{LoraAdapter, LoraParameters}; pub use memmap2::Mmap; -pub use model::{Hyperparameters, KnownModel, Model, ModelContext, ModelParameters, OutputRequest}; +pub use model::{Model, ModelContext, ModelParameters, OutputRequest}; pub use quantize::{quantize, QuantizeError, QuantizeProgress}; pub use regex::Regex; pub use tokenizer::{ diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index f00b2974..58b4ab4d 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -1,25 +1,25 @@ +//! Functionality for loading models. Very barebones; designed to be driven by `llm`. + use std::{ - collections::HashMap, - error::Error, - fmt::{Debug, Display, Formatter}, + fmt::{Display, Formatter}, fs::File, - io::{BufRead, BufReader, Read, Seek, SeekFrom}, - path::{Path, PathBuf}, + io::{BufRead, BufReader, Seek, SeekFrom}, + path::Path, sync::Arc, }; use crate::{ - util, Hyperparameters, KnownModel, LoraAdapter, LoraParameters, ModelContext, ModelParameters, - TokenId, Tokenizer, TokenizerLoadError, TokenizerSource, + model::{HyperparametersReadError, ModelData, ModelLoadArgs, ModelLoadError}, + LoraAdapter, Model, ModelContext, ModelParameters, TokenizerLoadError, TokenizerSource, }; -pub use ggml::{format::FormatMagic, ContainerType}; +pub use ggml::{format::gguf::MetadataError, format::ContainerType, util::FileMagic}; use ggml::{ - format::{LoadError as FormatLoadError, PartialHyperparameters, TensorLoadInfo}, + format::gguf::{Gguf, GgufLoadError, Metadata, MetadataValue, MetadataValueType, TensorInfo}, + sys::llama::llama_ftype, Context, MAX_NAME_LENGTH, }; use memmap2::Mmap; use thiserror::Error; -use tracing::log; #[derive(Debug, PartialEq, Clone, Copy, Eq, Default)] /// Information about the file. @@ -29,19 +29,17 @@ pub struct FileType { /// The quantization version. 
pub quantization_version: u32, } -impl From for i32 { +impl From for llama_ftype { fn from(value: FileType) -> Self { - (value.quantization_version * ggml::QNT_VERSION_FACTOR) as i32 - + ggml::sys::llama::llama_ftype::from(value.format) as i32 + (value.quantization_version * ggml::QNT_VERSION_FACTOR) as llama_ftype + + llama_ftype::from(value.format) } } -impl TryFrom for FileType { - type Error = (); +impl TryFrom for FileType { + type Error = llama_ftype; - fn try_from(value: u32) -> Result { - let format = FileTypeFormat::try_from( - (value % ggml::QNT_VERSION_FACTOR) as ggml::sys::llama::llama_ftype, - )?; + fn try_from(value: llama_ftype) -> Result { + let format = FileTypeFormat::try_from((value % ggml::QNT_VERSION_FACTOR) as llama_ftype)?; Ok(Self { format, @@ -54,6 +52,23 @@ impl Display for FileType { write!(f, "{}_qnt{}", self.format, self.quantization_version) } } +impl FileType { + /// Helper function that reads the file type from the metadata and converts + /// it to the enum, or fails with a `HyperparametersReadError`. + pub fn read_for_hyperparameters( + metadata: &Metadata, + ) -> Result, HyperparametersReadError> { + metadata + .get_optional("general.file_type") + .and_then(|v| v.as_uint32()) + .map(|v| { + FileType::try_from(v as llama_ftype).map_err(|ftype| { + HyperparametersReadError::UnsupportedFileType { file_type: ftype } + }) + }) + .transpose() + } +} /// How the tensors are stored in GGML LLM models. #[derive(Debug, PartialEq, Clone, Copy, Eq, Default)] @@ -96,10 +111,10 @@ pub enum FileTypeFormat { /// The tensors are stored using the `Q6_K` quantization scheme. MostlyQ6_K, } -impl TryFrom for FileTypeFormat { - type Error = (); +impl TryFrom for FileTypeFormat { + type Error = llama_ftype; - fn try_from(value: ggml::sys::llama::llama_ftype) -> Result { + fn try_from(value: llama_ftype) -> Result { use ggml::sys::llama::*; match value { LLAMA_FTYPE_ALL_F32 => Ok(FileTypeFormat::F32), @@ -119,11 +134,12 @@ impl TryFrom for FileTypeFormat { LLAMA_FTYPE_MOSTLY_Q5_K_S => Ok(FileTypeFormat::MostlyQ5_K_S), LLAMA_FTYPE_MOSTLY_Q5_K_M => Ok(FileTypeFormat::MostlyQ5_K_M), LLAMA_FTYPE_MOSTLY_Q6_K => Ok(FileTypeFormat::MostlyQ6_K), - _ => Err(()), + #[allow(clippy::unnecessary_cast)] + _ => Err(value), } } } -impl From for ggml::sys::llama::llama_ftype { +impl From for llama_ftype { fn from(value: FileTypeFormat) -> Self { use ggml::sys::llama::*; match value { @@ -175,10 +191,14 @@ impl Display for FileTypeFormat { } } +/// Helper trait that implements traits required for reading. +pub trait Source: BufRead + Seek {} +impl Source for S {} + /// Each variant represents a step within the process of loading the model. /// These can be used to report progress to the user. #[derive(Clone, PartialEq, Eq, Debug)] -pub enum LoadProgress { +pub enum LoadProgress<'a> { /// The hyperparameters have been loaded from the model. HyperparametersLoaded, /// The context has been created. @@ -189,9 +209,9 @@ pub enum LoadProgress { /// A tensor was patched with a LoRA. LoraApplied { /// The name of the patched tensor. - name: String, + name: &'a str, /// LoRA file the patch was applied from. - source: PathBuf, + source: &'a Path, }, /// A tensor from the current part has been loaded. TensorLoaded { @@ -212,33 +232,14 @@ pub enum LoadProgress { #[derive(Error, Debug)] /// Errors encountered during the loading process. pub enum LoadError { - #[error("the file {path:?} does not exist")] + #[error("the file does not exist")] /// The file does not exist. 
- FileDoesNotExist { - /// The path that failed. - path: PathBuf, - }, - #[error("could not open file {path:?}")] + FileDoesNotExist, + #[error("could not open file")] /// A file failed to open. OpenFileFailed { /// The original error. source: std::io::Error, - /// The path that failed. - path: PathBuf, - }, - #[error("no parent path for {path:?}")] - /// There is no parent path for a given path. - NoParentPath { - /// The path without a parent. - path: PathBuf, - }, - #[error("unable to read exactly {bytes} bytes")] - /// Reading exactly `bytes` from a file failed. - ReadExactFailed { - /// The original error. - source: std::io::Error, - /// The number of bytes that were attempted to be read. - bytes: usize, }, #[error("non-specific I/O error")] /// A non-specific IO error. @@ -249,17 +250,11 @@ pub enum LoadError { #[error("invalid integer conversion")] /// One of the integers encountered could not be converted to a more appropriate type. InvalidIntegerConversion(#[from] std::num::TryFromIntError), - #[error("unsupported ftype: {0}")] - /// The `ftype` hyperparameter had an invalid value. This usually means that the format used - /// by this file is unrecognized by this version of `llm`. - UnsupportedFileType(u32), - #[error("invalid magic number {magic} for {path:?}")] - /// An invalid magic number was encountered during the loading process. + #[error("invalid magic value {magic}")] + /// An invalid magic value was encountered during the loading process. InvalidMagic { - /// The path that failed. - path: PathBuf, - /// The magic number that was encountered. - magic: FormatMagic, + /// The magic value that was encountered. + magic: FileMagic, }, #[error("invalid file format {container_type:?}")] /// The version of the format is not supported by this version of `llm`. @@ -267,273 +262,196 @@ pub enum LoadError { /// The format that was encountered. container_type: ContainerType, }, - #[error("invalid value {ftype} for `f16` in hyperparameters")] - /// The `f16` hyperparameter had an invalid value. - HyperparametersF16Invalid { - /// The format type that was encountered. - ftype: i32, - }, - #[error("unknown tensor `{tensor_name}` in {path:?}")] - /// The tensor `tensor_name` was encountered during the loading of `path`, but was not seen during - /// the model prelude. - UnknownTensor { - /// The name of the tensor. - tensor_name: String, - /// The path that failed. - path: PathBuf, - }, - #[error("the tensor `{tensor_name}` has the wrong size in {path:?}")] - /// The tensor `tensor_name` did not match its expected size. - TensorWrongSize { - /// The name of the tensor. - tensor_name: String, - /// The path that failed. - path: PathBuf, - }, - /// The tensor `tensor_name` did not have the expected format type. - #[error("invalid ftype {ftype} for tensor `{tensor_name}` in {path:?}")] + /// The tensor `tensor_name` had an unsupported element type. + #[error("invalid element type {element_type} for tensor `{tensor_name}`")] UnsupportedElementType { /// The name of the tensor. tensor_name: String, - /// The format type that was encountered. - ftype: u32, - /// The path that failed. - path: PathBuf, + /// The element type that was encountered. + element_type: u32, }, - /// An invariant was broken. - /// - /// This error is not relevant unless `loader2` is being used. - #[error("invariant broken: {invariant} in {path:?}")] - InvariantBroken { - /// The path that failed. - path: Option, - /// The invariant that was broken. - invariant: String, + /// The tokenizer could not be loaded. 
+ #[error("could not load tokenizer: {0}")] + TokenizerLoadFail(#[from] TokenizerLoadError), + /// The quantization version was missing, despite this model containing quantized tensors. + #[error("quantization version was missing, despite model containing quantized tensors")] + MissingQuantizationVersion, + /// The quantization version is not supported by this version of `llm`. + #[error("quantization version {quantization_version:?} is not supported")] + UnsupportedQuantizationVersion { + /// The quantization version that was encountered. + quantization_version: MetadataValue, }, - /// The model could not be created. - /// - /// This implies that there were no tensors in the model to be loaded. - /// - /// This error is not relevant unless `loader2` is being used. - #[error("could not create model from {path:?}")] - ModelNotCreated { - /// The path that failed. - path: PathBuf, + /// The model expected a metadata key-value pair, but the key was missing. + #[error("missing metadata key {key:?}")] + MissingMetadataKey { + /// The key that was missing. + key: String, }, - /// Multiple parts of the model were found. - /// - /// Multi-part models are not supported. Please convert the model to a single part. - #[error("multipart models are not supported")] - MultipartNotSupported { - /// The paths that were found. - paths: Vec, + /// The metadata key-value pair was not of the expected type. + #[error("metadata key {key:?} was not of the expected type")] + InvalidMetadataType { + /// The key with the invalid type. + key: String, + /// The expected type. + expected_type: MetadataValueType, + /// The actual type. + actual_type: MetadataValueType, }, - /// The tokenizer could not be loaded. - #[error("could not load tokenizer {path:?}: {error}")] - TokenizerLoadFail { - /// The invalid tokenizer path - path: PathBuf, - - /// The error that occurred. - error: Box, + /// The file type within the model was not supported by this version of `llm`. + #[error("file type {file_type} is not supported")] + UnsupportedFileType { + /// The file type (ignoring the quantization version) that was encountered. + file_type: llama_ftype, }, - /// There is insufficient information to guess the model architecture from the provided file. - /// - /// A model architecture must be provided to load the model. - #[error( - "could not guess model architecture from {path:?}. Please provide a model architecture." - )] - MissingModelArchitecture { - /// The path that failed. - path: PathBuf, + /// The architecture in the file is not known to the loader. + #[error("unknown architecture {architecture}")] + UnknownArchitecture { + /// The architecture that was encountered. + architecture: String, }, + /// An error occurred while reading the hyperparameters. + #[error("{0}")] + HyperparametersReadError(#[from] HyperparametersReadError), + /// An error occurred while loading the concrete model. 
+ #[error("{0}")] + ModelLoadError(#[from] ModelLoadError), } -impl From for LoadError { - fn from(value: util::FindAllModelFilesError) -> Self { - match value { - util::FindAllModelFilesError::NoParentPath { path } => LoadError::NoParentPath { path }, - util::FindAllModelFilesError::IO(err) => LoadError::Io(err), - } - } -} -impl From for LoadError { - fn from(value: TokenizerLoadError) -> Self { - LoadError::TokenizerLoadFail { - path: value.path, - error: value.error, - } - } -} - -impl LoadError { - #[doc(hidden)] - pub fn from_format_error(value: FormatLoadError, path: PathBuf) -> Self { +impl From for LoadError { + fn from(value: GgufLoadError) -> Self { match value { - FormatLoadError::InvalidMagic(magic) => LoadError::InvalidMagic { path, magic }, - FormatLoadError::InvalidFormatVersion(container_type) => { + GgufLoadError::InvalidMagic(magic) => LoadError::InvalidMagic { magic }, + GgufLoadError::InvalidFormatVersion(container_type) => { LoadError::InvalidFormatVersion { container_type } } - FormatLoadError::Io(err) => LoadError::Io(err), - FormatLoadError::InvalidUtf8(err) => LoadError::InvalidUtf8(err), - FormatLoadError::InvalidIntegerConversion(err) => { + GgufLoadError::Io(err) => LoadError::Io(err), + GgufLoadError::InvalidUtf8(err) => LoadError::InvalidUtf8(err), + GgufLoadError::InvalidIntegerConversion(err) => { LoadError::InvalidIntegerConversion(err) } - FormatLoadError::ImplementationError(err) => err, - FormatLoadError::UnsupportedElementType { tensor_name, ftype } => { + GgufLoadError::UnsupportedElementType { tensor_name, ftype } => { LoadError::UnsupportedElementType { - path, tensor_name, - ftype, + element_type: ftype, } } - FormatLoadError::InvariantBroken(invariant) => LoadError::InvariantBroken { - path: Some(path), - invariant, - }, } } } +impl From for LoadError { + fn from(value: MetadataError) -> Self { + Self::HyperparametersReadError(HyperparametersReadError::MetadataError(value)) + } +} -/// Used by models to fetch tensors from a loader. -pub trait TensorLoader { - /// Gets a tensor from the loader. - fn load(&mut self, name: &str) -> Result; - /// Finish loading the model, returning the context. - fn finish(self) -> ModelContext; +/// When given args, attempt to instantiate a model. +pub type ModelLoadCallback = fn(ModelLoadArgs) -> Result, ModelLoadError>; + +/// A factory that can retrieve the constructor for a given model architecture. +pub trait ModelFactory { + /// For a given architecture name, return a function that will load the model, + /// or `None` if the architecture is not supported. + fn load(&self, architecture: &str) -> Option; } -/// Load a GGML model from the `path` and configure it per the `params`. The status -/// of the loading process will be reported through `load_progress_callback`. -/// -/// Note that the model must be a single-part model, and the model in `path` -/// *must* match the architecture of `M`. -/// -/// # Panics +/// Loads the specified GGUF model from disk, determining its architecture from the metadata. /// -/// - If the model does not match the architecture of `M`. This is not checked -/// before execution, so this function will panic if the model does not match -/// the architecture. -/// -/// This is a limitation of the GGML format, which does not -/// store any information about the architecture. -pub fn load( +/// This method returns a [`Box`], which means that the model will have single ownership. +/// If you'd like to share ownership (i.e. 
to use the model in multiple threads), we +/// suggest using [`Arc::from(Box)`](https://doc.rust-lang.org/std/sync/struct.Arc.html#impl-From%3CBox%3CT,+Global%3E%3E-for-Arc%3CT%3E) +/// to convert the [`Box`] into an [`Arc`] after loading. +pub fn load( path: &Path, tokenizer_source: TokenizerSource, params: ModelParameters, - load_progress_callback: impl FnMut(LoadProgress), -) -> Result { + model_factory: impl ModelFactory, + mut load_progress_callback: impl FnMut(LoadProgress), +) -> Result, LoadError> { if !path.exists() { - return Err(LoadError::FileDoesNotExist { - path: path.to_owned(), - }); + return Err(LoadError::FileDoesNotExist); } - let paths = util::find_all_model_files(path)?; - if paths.len() != 1 { - return Err(LoadError::MultipartNotSupported { paths }); - } - - let file = File::open(path).map_err(|e| LoadError::OpenFileFailed { - source: e, - path: path.to_owned(), - })?; + let file = File::open(path).map_err(|e| LoadError::OpenFileFailed { source: e })?; let mut reader = BufReader::new(&file); - log::trace!("Read model file from {:?}", path); - - let tokenizer = tokenizer_source.retrieve(path)?; - let mut loader = Loader::new(tokenizer, load_progress_callback); - - ggml::format::load(&mut reader, &mut loader) - .map_err(|err| LoadError::from_format_error(err, path.to_owned()))?; - log::trace!("Loaded GGML model from reader"); - - let Loader { - hyperparameters, - tokenizer, - tensors, - mut load_progress_callback, - container_type, - .. - } = loader; - - let quantization_version = (&hyperparameters as &M::Hyperparameters) - .file_type() - .map(|ft| ft.quantization_version) - .unwrap_or_default(); - let quantization_version = if quantization_version == 0 { - // HACK: I think llama.cpp does not actually write the quantization version correctly, - // so we need to guess it from the container type. 
- if container_type == ggml::ContainerType::Ggjt(2) { - 1 - } else if container_type == ggml::ContainerType::Ggjt(3) { - 2 - } else { - quantization_version - } - } else { - quantization_version - }; - log::trace!( + tracing::trace!("Read model file from {:?}", path); + + let gguf = Gguf::load(&mut reader)?; + tracing::trace!("Loaded GGML model from reader"); + + let architecture = gguf.metadata.get_str("general.architecture")?; + let tokenizer = tokenizer_source.retrieve(&gguf)?; + + let quantization_version = gguf.metadata.get_optional("general.quantization_version"); + tracing::trace!( "Determined quantization version of model as {:?}", quantization_version ); // TODO: this is temporary while we figure out how to handle this - if tensors.values().any(|t| t.element_type.is_quantized()) { - assert_eq!(quantization_version, 2, "quantization version must be 2"); + let any_quantized = gguf + .tensor_infos + .values() + .any(|t| t.element_type.is_quantized()); + if any_quantized { + match quantization_version { + Some(MetadataValue::UInt32(2)) => { + // Currently supported version + } + Some(quantization_version) => { + return Err(LoadError::UnsupportedQuantizationVersion { + quantization_version: quantization_version.clone(), + }) + } + None => return Err(LoadError::MissingQuantizationVersion), + } } - let use_mmap = - params.prefer_mmap && container_type.support_mmap() && params.lora_adapters.is_none(); + let use_mmap = params.prefer_mmap && params.lora_adapters.is_none(); - let ctx_size = tensors + let ctx_size = gguf + .tensor_infos .values() .map(|ti| ti.calc_absolute_size(use_mmap)) .sum::(); - log::trace!("Context size: {:?}", ctx_size); + tracing::trace!("Context size: {:?}", ctx_size); let mut lora_adapters: Option> = None; if let Some(lora_paths) = ¶ms.lora_adapters { let adapters: Result, _> = lora_paths - .iter() - .map(|lora_path| { - // Read the LoRA file - let lora_file = File::open(lora_path).map_err(|e| LoadError::OpenFileFailed { - source: e, - path: lora_path.to_owned(), - })?; - let mut lora_reader = BufReader::new(&lora_file); - // TODO: Consider updating the progress callback to report the progress of the LoRA file. - // Most LoRAs are small enough that this is not necessary, but it would be nice to have. 
- let mut lora_loader: Loader = - Loader::new(Tokenizer::empty_embedded(), |_| {}); - ggml::format::load(&mut lora_reader, &mut lora_loader) - .map_err(|err| LoadError::from_format_error(err, lora_path.to_owned()))?; - - // Collect the names of the tensors that should be patched - let tensors_to_patch = lora_loader - .tensors - .keys() - .filter_map(|k| Some(k.rsplit_once('.')?.0.to_owned())) - .collect(); - - log::trace!("Loaded LoRA weights"); - // Return the LoRA patches - Ok::<_, LoadError>(LoraAdapter { - scaling: lora_loader.hyperparameters.calculate_scaling(), - tensors: lora_loader.tensors, - tensors_to_patch, - file: lora_file, - path: lora_path.to_owned(), - }) + .iter() + .map(|lora_path| { + // Read the LoRA file + let lora_file = File::open(lora_path).map_err(|e| LoadError::OpenFileFailed { + source: e, + })?; + let mut lora_reader = BufReader::new(&lora_file); + let gguf = Gguf::load(&mut lora_reader)?; + + // Collect the names of the tensors that should be patched + let tensors_to_patch = gguf + .tensor_infos + .keys() + .filter_map(|k| Some(k.rsplit_once('.')?.0.to_owned())) + .collect(); + + tracing::trace!("Loaded LoRA weights"); + // Return the LoRA patches + #[allow(unreachable_code)] + Ok::<_, LoadError>(LoraAdapter { + tensors: gguf.tensor_infos.clone(), + tensors_to_patch, + source: Box::new(lora_reader), + path: lora_path.to_owned(), + gguf, + scaling: todo!("Calculate scaling from LoRA file metadata (GGUF does not have standardised metadata yet)"), }) - .collect(); + }) + .collect(); lora_adapters = Some(adapters?); } (load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size }); let (context, file_size) = if use_mmap { - let file = File::open(path)?; unsafe { let mmap = Mmap::map(&file)?; let file_size = mmap.len() as u64; @@ -543,204 +461,158 @@ pub fn load( (Context::new_with_allocate(ctx_size), file.metadata()?.len()) }; - let tensors_len = tensors.len(); - let tl = MmapCompatibleLoader { - path: path.to_owned(), - file, - tensors, - context, - lora_adapters, - load_progress_callback: &mut load_progress_callback, - loaded_tensors: Default::default(), - }; - - let model = KnownModel::new(hyperparameters, params, tokenizer, tl)?; + let model_constructor = + model_factory + .load(architecture) + .ok_or_else(|| LoadError::UnknownArchitecture { + architecture: architecture.to_string(), + })?; + let model = (model_constructor)(ModelLoadArgs { + gguf: &gguf, + data: ModelData { params, tokenizer }, + tensor_loader: ModelTensorLoader { + tensor_loader: TensorLoader { + source: &mut reader, + gguf: &gguf, + context, + }, + lora_adapters, + progress_callback: &mut load_progress_callback, + loaded_tensor_count: 0, + }, + })?; (load_progress_callback)(LoadProgress::Loaded { file_size, - tensor_count: tensors_len, + tensor_count: gguf.tensor_infos.len(), }); - log::trace!("Loaded model"); + tracing::trace!("Loaded model"); Ok(model) } -/// A GGML format loader for LLMs. -pub struct Loader { - // Input - load_progress_callback: F, - - // Input/Output - /// The tokenizer of the model. - pub tokenizer: Tokenizer, - - // Output - /// The container type of the model. - pub container_type: ContainerType, - /// The hyperparameters of the model. - pub hyperparameters: Hp, - /// The tensors of the model. - pub tensors: HashMap, -} -impl Loader { - /// Creates a new loader. 
- pub fn new(tokenizer: Tokenizer, load_progress_callback: F) -> Self { - Self { - load_progress_callback, - - container_type: ContainerType::Ggml, - hyperparameters: Hp::default(), - tokenizer, - tensors: HashMap::default(), +/// A implementation for `load_progress_callback` that outputs to `stdout`. +pub fn load_progress_callback_stdout(progress: LoadProgress) { + match progress { + LoadProgress::HyperparametersLoaded => println!("Loaded hyperparameters"), + LoadProgress::ContextSize { bytes } => println!( + "ggml ctx size = {:.2} MB\n", + bytes as f64 / (1024.0 * 1024.0) + ), + LoadProgress::TensorLoaded { + current_tensor, + tensor_count, + .. + } => { + let current_tensor = current_tensor + 1; + if current_tensor % 8 == 0 { + println!("Loaded tensor {current_tensor}/{tensor_count}"); + } } - } -} -impl ggml::format::LoadHandler - for Loader -{ - fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> { - self.container_type = container_type; - Ok(()) - } - - fn vocabulary_token(&mut self, i: usize, token: Vec, score: f32) -> Result<(), LoadError> { - if let Tokenizer::Embedded(mv) = &mut self.tokenizer { - let id = match TokenId::try_from(i) { - Ok(id) => id, - Err(err) => return Err(LoadError::InvalidIntegerConversion(err)), - }; - - mv.push_token(id, token, score); + LoadProgress::Loaded { + file_size: byte_size, + tensor_count, + } => { + println!("Loading of model complete"); + println!( + "Model size = {:.2} MB / num tensors = {}", + byte_size as f64 / 1024.0 / 1024.0, + tensor_count + ); } - - Ok(()) - } - - fn read_hyperparameters( - &mut self, - reader: &mut dyn BufRead, - ) -> Result { - // NOTE: Field order matters! Data is laid out in the file exactly in this order. - let hyperparameters = Hp::read_ggml(reader)?; - let partial = PartialHyperparameters { - n_vocab: hyperparameters.n_vocabulary(), - }; - self.hyperparameters = hyperparameters; - (self.load_progress_callback)(LoadProgress::HyperparametersLoaded); - - Ok(partial) - } - - fn tensor_buffer(&mut self, info: TensorLoadInfo) -> Result<(), LoadError> { - self.tensors.insert(info.name.clone(), info); - Ok(()) - } + LoadProgress::LoraApplied { name, source } => { + println!( + "Patched tensor {} via LoRA from '{}'", + name, + source.file_name().unwrap().to_str().unwrap() + ); + } + }; } -struct MmapCompatibleLoader<'a> { - path: PathBuf, - file: File, - tensors: HashMap, - context: Context, - lora_adapters: Option>, - load_progress_callback: &'a mut dyn FnMut(LoadProgress), - loaded_tensors: HashMap, +/// A helper struct for loading tensors from a model. +pub struct ModelTensorLoader<'a> { + pub(crate) tensor_loader: TensorLoader<'a>, + pub(crate) lora_adapters: Option>, + pub(crate) progress_callback: &'a mut dyn FnMut(LoadProgress), + pub(crate) loaded_tensor_count: usize, } -impl TensorLoader for MmapCompatibleLoader<'_> { - fn load(&mut self, name: &str) -> Result { - let info = self.tensors.get(name).ok_or(LoadError::UnknownTensor { - tensor_name: String::from(name), - path: Default::default(), - })?; - - let mut main_context = FileContext::new(&self.context, &mut self.file, &self.path); - - let mut tensor = main_context.get_tensor(info)?; +impl ModelTensorLoader<'_> { + /// Load a tensor from the model. 
+ pub fn load(&mut self, name: &str) -> Result { + let (mut tensor, info) = self.tensor_loader.load(name)?; if let Some(lora_adapters) = &mut self.lora_adapters { for lora_adapter in lora_adapters { - lora_adapter.patch(info, &mut tensor)?; - (self.load_progress_callback)(LoadProgress::LoraApplied { - name: name.to_owned(), - source: lora_adapter.path.to_owned(), + lora_adapter.patch(name, info, &mut tensor)?; + (self.progress_callback)(LoadProgress::LoraApplied { + name, + source: &lora_adapter.path, }); } } - (self.load_progress_callback)(LoadProgress::TensorLoaded { - current_tensor: self.loaded_tensors.len(), - tensor_count: self.tensors.len(), + self.loaded_tensor_count += 1; + (self.progress_callback)(LoadProgress::TensorLoaded { + current_tensor: self.loaded_tensor_count, + tensor_count: self.tensor_loader.gguf.tensor_infos.len(), }); - self.loaded_tensors.insert(name.to_owned(), tensor.share()); Ok(tensor) } - fn finish(self) -> ModelContext { + /// Finish loading tensors from the model, and get the model context. + pub fn finish(self) -> ModelContext { // We can ignore this warning as it's OK to share this particular // context around, being that it is immutable. #[allow(clippy::arc_with_non_send_sync)] - ModelContext(Arc::new(self.context)) + ModelContext(Arc::new(self.tensor_loader.finish())) } } -pub(crate) struct FileContext<'a> { - context: &'a Context, - file: &'a mut File, - path: &'a Path, +pub(crate) struct TensorLoader<'a> { + pub source: &'a mut dyn Source, + pub gguf: &'a Gguf, + pub context: Context, } -impl<'a> FileContext<'a> { - pub(crate) fn new(context: &'a Context, file: &'a mut File, path: &'a Path) -> Self { - Self { - context, - file, - path, - } - } - - pub(crate) fn get_tensor(&mut self, info: &TensorLoadInfo) -> Result { - let name = &info.name; - let ne = info.dims(); - let dims = ne.len(); - - if dims != info.n_dims { - return Err(LoadError::InvariantBroken { - path: Some(self.path.to_owned()), - invariant: format!( - "the tensor {name} should have {} dimensions, not {}", - info.n_dims, dims - ), - }); - } - - let mut tensor = match dims { - 1 => self.context.new_tensor_1d(info.element_type, ne[0]), - 2 => self.context.new_tensor_2d(info.element_type, ne[0], ne[1]), - 3 => self - .context - .new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]), - _ => { - return Err(LoadError::InvariantBroken { - path: Some(self.path.to_owned()), - invariant: format!( - "the tensor {name} should have between 1 and 3 dimensions, not {dims}" - ), - }) +impl TensorLoader<'_> { + pub fn load(&mut self, name: &str) -> Result<(ggml::Tensor, &TensorInfo), TensorLoadError> { + let info = self + .gguf + .tensor_infos + .get(name) + .ok_or(TensorLoadError::UnknownTensor { + tensor_name: String::from(name), + })?; + + let ty = info.element_type; + let dims = &info.dimensions; + + let mut tensor = match dims.len() { + 1 => self.context.new_tensor_1d(ty, dims[0]), + 2 => self.context.new_tensor_2d(ty, dims[0], dims[1]), + 3 => self.context.new_tensor_3d(ty, dims[0], dims[1], dims[2]), + other => { + return Err(TensorLoadError::UnsupportedTensorDimensionCount { + tensor_name: name.to_string(), + dimensions: other, + }); } }; + let offset = self.gguf.tensor_data_position + info.offset; match self.context.storage().as_mmap() { Some(mmap) => unsafe { - let ptr = mmap.as_ptr().offset(info.start_offset as isize); + let ptr = mmap.as_ptr().offset(offset as isize); tensor.set_data(ptr as *mut std::ffi::c_void); }, None => { let buf: &mut [u8] = unsafe { 
std::slice::from_raw_parts_mut(tensor.data() as *mut u8, tensor.nbytes()) }; - self.file.seek(SeekFrom::Start(info.start_offset))?; - self.file.read_exact(buf)?; + self.source.seek(SeekFrom::Start(offset))?; + self.source.read_exact(buf)?; } } @@ -751,45 +623,35 @@ impl<'a> FileContext<'a> { name }; - Ok(tensor.set_name(tensor_name)) + Ok((tensor.set_name(tensor_name), info)) + } + + pub fn finish(self) -> Context { + self.context } } -/// A implementation for `load_progress_callback` that outputs to `stdout`. -pub fn load_progress_callback_stdout(progress: LoadProgress) { - match progress { - LoadProgress::HyperparametersLoaded => println!("Loaded hyperparameters"), - LoadProgress::ContextSize { bytes } => println!( - "ggml ctx size = {:.2} MB\n", - bytes as f64 / (1024.0 * 1024.0) - ), - LoadProgress::TensorLoaded { - current_tensor, - tensor_count, - .. - } => { - let current_tensor = current_tensor + 1; - if current_tensor % 8 == 0 { - println!("Loaded tensor {current_tensor}/{tensor_count}"); - } - } - LoadProgress::Loaded { - file_size: byte_size, - tensor_count, - } => { - println!("Loading of model complete"); - println!( - "Model size = {:.2} MB / num tensors = {}", - byte_size as f64 / 1024.0 / 1024.0, - tensor_count - ); - } - LoadProgress::LoraApplied { name, source } => { - println!( - "Patched tensor {} via LoRA from '{}'", - name, - source.file_name().unwrap().to_str().unwrap() - ); - } - }; +#[derive(Error, Debug)] +/// Errors encountered during loaing of tensors. +pub enum TensorLoadError { + #[error("unknown tensor `{tensor_name}`")] + /// The tensor `tensor_name` is required for this model architecture, + /// but was not found in the model. + UnknownTensor { + /// The name of the tensor. + tensor_name: String, + }, + /// A tensor with an unsupported number of dimensions was encountered. + #[error( + "tensor {tensor_name} has {dimensions} dimensions, but only 1-3 dimensions are supported" + )] + UnsupportedTensorDimensionCount { + /// The name of the tensor. + tensor_name: String, + /// The number of dimensions that were encountered. + dimensions: usize, + }, + #[error("non-specific I/O error")] + /// A non-specific IO error. + Io(#[from] std::io::Error), } diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs index f433931e..c034fdaa 100644 --- a/crates/llm-base/src/lora.rs +++ b/crates/llm-base/src/lora.rs @@ -1,14 +1,11 @@ -use crate::{ - loader::FileContext, model::HyperparametersWriteError, util, FileType, Hyperparameters, - LoadError, -}; +use crate::loader::{Source, TensorLoadError, TensorLoader}; -use ggml::{format::TensorLoadInfo, GraphExecutionPlan}; -use std::{ - collections::{HashMap, HashSet}, - fs::File, - path::PathBuf, +use ggml::{ + format::gguf::{Gguf, TensorInfo}, + GraphExecutionPlan, }; +use indexmap::IndexMap; +use std::{collections::HashSet, path::PathBuf}; #[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] /// Parameters for a [LoRA](https://arxiv.org/abs/2106.09685) adapter. @@ -24,70 +21,49 @@ impl LoraParameters { (self.alpha as f32) / (self.r as f32) } } -impl Hyperparameters for LoraParameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - Ok(LoraParameters { - r: util::read_i32(reader)?, - alpha: util::read_i32(reader)?, - }) - } - - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.r)?; - util::write_i32(writer, self.alpha)?; - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - // LoRA adapters do not have a vocabulary. 
- 0 - } - - fn file_type(&self) -> Option { - None - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - None - } -} /// [LoRA](https://arxiv.org/abs/2106.09685) adapter for a model. pub struct LoraAdapter { /// Scaling to apply to the LoRA weights. pub scaling: f32, /// The tensors of the LoRA. - pub tensors: HashMap, + pub tensors: IndexMap, /// Names of the tensors that should be patched. pub tensors_to_patch: HashSet, - /// File containing the LoRA weights. - pub file: File, + /// Source containing the LoRA weights. + pub source: Box, /// Path to the LoRA file. pub path: PathBuf, + /// The loaded GGUF for the LoRA. + pub gguf: Gguf, } impl LoraAdapter { /// Patch a tensor via LoRA pub fn patch( &mut self, - info: &TensorLoadInfo, + name: &str, + info: &TensorInfo, tensor: &mut ggml::Tensor, - ) -> Result<(), LoadError> { + ) -> Result<(), TensorLoadError> { // Check if we need to patch this tensor - let name = &info.name; if !self.tensors_to_patch.contains(name) { return Ok(()); } - let a_info = self.get_info(&format!("{}.loraA", name))?; - let b_info = self.get_info(&format!("{}.loraB", name))?; + let a_name = format!("{}.loraA", name); + let a_info = self.get_info(&a_name)?; + + let b_name = format!("{}.loraB", name); + let b_info = self.get_info(&b_name)?; let must_scale = self.scaling != 1.0; // Calculate the size of the patch context via the following steps: // 1. Calculate the size of the two `a` and `b` tensors // 2. Calculate the size of the original tensor // 3. Calculate the size of the `ba` and tensors. It has the same dimensions as the original tensor, but is of the element type of the `a` or `b` tensor e.g. fp16 - let ba_size = ggml::format::tensor_size(a_info.element_type, info.dims().iter().product()); + let ba_size = + ggml::format::tensor_size(a_info.element_type, info.dimensions.iter().product()); let mut patch_context_size = a_info.calc_absolute_size(false) + b_info.calc_absolute_size(false) + info.calc_absolute_size(false) @@ -96,7 +72,7 @@ impl LoraAdapter { // 3b. (Optional) If we need to scale the `ba` tensor, we need to allocate for a second `ba` and the `scaled` tensors which will be crated as an `f32` tensor. 
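        // `must_scale` is true when `scaling` (LoRA alpha / r, see `LoraParameters::scaling`)
        // is not exactly 1.0; in that case the patch context also needs room for a second `ba`
        // tensor plus the f32 `scaled` tensor described in step 3b above.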
if must_scale { let scaled_size = - ggml::format::tensor_size(ggml::ElementType::F32, info.dims().iter().product()); + ggml::format::tensor_size(ggml::ElementType::F32, info.dimensions.iter().product()); patch_context_size += scaled_size + ba_size; } @@ -106,14 +82,18 @@ impl LoraAdapter { // Create a temporary context for the patching operations // TODO: test if GPU can be enabled (make it configurable) let patch_context = ggml::Context::new_with_allocate(patch_context_size); - let mut patch_file = FileContext::new(&patch_context, &mut self.file, &self.path); + let mut loader = TensorLoader { + source: self.source.as_mut(), + context: patch_context, + gguf: &self.gguf, + }; // Load the A and B tensors - let a = patch_file.get_tensor(&a_info)?; - let b = patch_file.get_tensor(&b_info)?; - - //Build a ggml context and apply the patch + let (a, _) = loader.load(&a_name)?; + let (b, _) = loader.load(&b_name)?; + // Build a ggml context and apply the patch + let patch_context = loader.finish(); let mut gf = patch_context.create_compute_graph(); // LoRA formula: w = w + ba*s @@ -142,12 +122,11 @@ impl LoraAdapter { Ok(()) } - fn get_info(&self, name: &str) -> Result { + fn get_info(&self, name: &str) -> Result { self.tensors .get(name) .cloned() - .ok_or(LoadError::UnknownTensor { - path: self.path.to_owned(), + .ok_or(TensorLoadError::UnknownTensor { tensor_name: name.to_owned(), }) } diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index ab30e4f2..916f62e1 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -1,54 +1,59 @@ //! Large language model traits and types -use std::{ - error::Error, - fmt::Debug, - io::{BufRead, Write}, - path::{Path, PathBuf}, - sync::Arc, -}; +use std::{fmt::Debug, path::PathBuf, sync::Arc}; -use ggml::accelerator::Backend; +use ggml::{ + accelerator::Backend, + format::gguf::{Gguf, MetadataError}, + sys::llama::llama_ftype, +}; use regex::Regex; use thiserror::Error; use crate::{ - loader::TensorLoader, tokenizer::TokenId, FileType, InferenceSession, InferenceSessionConfig, - LoadError, LoadProgress, Tokenizer, TokenizerSource, + loader::{ModelTensorLoader, TensorLoadError}, + tokenizer::TokenId, + InferenceSession, InferenceSessionConfig, Tokenizer, }; /// Common functions for model evaluation pub mod common; -/// Interfaces for creating and interacting with a large language model with a known type -/// of [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)). -pub trait KnownModel: Send + Sync { - /// Hyperparameters for the model. - type Hyperparameters: Hyperparameters; +/// All of the arguments required to load a model. +pub struct ModelLoadArgs<'a> { + /// The GGUF metadata for the model. + pub gguf: &'a Gguf, + /// Model metadata. + pub data: ModelData, + /// The tensor loader to use for the model. + pub tensor_loader: ModelTensorLoader<'a>, +} - /// Load this model from the `path` and configure it per the `params`. The status - /// of the loading process will be reported through `load_progress_callback`. This - /// is a helper function on top of [llm_base::load](crate::load). - fn load( - path: &Path, - tokenizer_source: TokenizerSource, - params: ModelParameters, - load_progress_callback: impl FnMut(LoadProgress), - ) -> Result - where - Self: Sized, - { - crate::load(path, tokenizer_source, params, load_progress_callback) - } +/// Model data that is required for all models. +pub struct ModelData { + /// Any parameters that control the behaviour of the model. 
+ pub params: ModelParameters, + /// The tokenizer to use for the model. + pub tokenizer: Tokenizer, +} +/// An error encountered while loading a concrete model. +#[derive(Error, Debug)] +pub enum ModelLoadError { + /// An error occurred while loading the model's tensors. + #[error("{0}")] + TensorLoadError(#[from] TensorLoadError), + /// An error occurred while reading the model's hyperparameters. + #[error("{0}")] + HyperparametersReadError(#[from] HyperparametersReadError), +} + +/// Interfaces for creating and interacting with a large language model with a known type +/// of [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)). +pub trait Model: Send + Sync { /// Creates a new model from the provided [ModelParameters] hyperparameters. /// This function is called by the [load](crate::loader::load) function. - fn new( - hyperparameters: Self::Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl TensorLoader, - ) -> Result + fn new(args: ModelLoadArgs) -> Result where Self: Sized; @@ -66,15 +71,19 @@ pub trait KnownModel: Send + Sync { output_request: &mut OutputRequest, ); - /// Get the hyperparameters for this model. - fn hyperparameters(&self) -> &Self::Hyperparameters; + /// Get the data for this model. + fn data(&self) -> &ModelData; /// Get the tokenizer for this model. - fn tokenizer(&self) -> &Tokenizer; + fn tokenizer(&self) -> &Tokenizer { + &self.data().tokenizer + } /// Get the context size (configured with [ModelParameters::context_size]) used by /// this model. - fn context_size(&self) -> usize; + fn context_size(&self) -> usize { + self.data().params.context_size + } /// Get the beginning of text/beginning of string token ID, if available. This value is defined by model implementers. fn bot_token_id(&self) -> Option; @@ -83,10 +92,10 @@ pub trait KnownModel: Send + Sync { fn eot_token_id(&self) -> TokenId; /// Get the list of regexes to use to determine if a tensor in this model should be quantized. - fn quantize_tensors() -> Vec; + fn quantize_tensors(&self) -> Vec; /// Get the list of regexes to use to determine if a tensor in this model should not be quantized. - fn skip_quantize_tensors() -> Vec; + fn skip_quantize_tensors(&self) -> Vec; /// Returns whether the model supports deleting tokens. fn supports_rewind(&self) -> bool { @@ -95,107 +104,24 @@ pub trait KnownModel: Send + Sync { } } -/// A type-erased model to allow for interacting with a model without knowing -/// its hyperparameters. -pub trait Model: Send + Sync { - /// Starts a new `InferenceSession` for this model. - fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession; - - /// This function is called by the provided [InferenceSession]; it will use this model - /// to generate output by evaluating the `input_tokens`. - /// The [OutputRequest] is used to specify additional data to fetch from the - /// model. - fn evaluate( - &self, - session: &mut InferenceSession, - input_tokens: &[TokenId], - output_request: &mut OutputRequest, - ); - - /// Get the tokenizer for this model. - fn tokenizer(&self) -> &Tokenizer; - - /// Get the context size (configured with [ModelParameters::context_size]) used by - /// this model. - fn context_size(&self) -> usize; - - /// Get the beginning of text/beginning of string token ID, if available. This value is defined by model implementers. - fn bot_token_id(&self) -> Option; - - /// Get the end of text/end of string token ID. This value is defined by model implementers. 
- fn eot_token_id(&self) -> TokenId; - - /// Returns whether the model supports deleting tokens. - fn supports_rewind(&self) -> bool; -} -impl> Model for M { - fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { - KnownModel::start_session(self, config) - } - - fn evaluate( - &self, - session: &mut InferenceSession, - input_tokens: &[TokenId], - output_request: &mut OutputRequest, - ) { - KnownModel::evaluate(self, session, input_tokens, output_request) - } - - fn tokenizer(&self) -> &Tokenizer { - KnownModel::tokenizer(self) - } - - fn context_size(&self) -> usize { - KnownModel::context_size(self) - } - - fn bot_token_id(&self) -> Option { - KnownModel::bot_token_id(self) - } - - fn eot_token_id(&self) -> TokenId { - KnownModel::eot_token_id(self) - } - - fn supports_rewind(&self) -> bool { - KnownModel::supports_rewind(self) - } -} - -/// Implemented by model hyperparameters for interacting with hyperparameters -/// without knowing what they are, as well as writing/reading them as required. -pub trait Hyperparameters: Sized + Default + Debug + PartialEq + Eq { - /// Read the parameters in GGML format from a reader. - fn read_ggml(reader: &mut dyn BufRead) -> Result; - - /// Write the parameters in GGML format to a writer. - fn write_ggml(&self, writer: &mut dyn Write) -> Result<(), HyperparametersWriteError>; - - /// Get the number of tokens in the embedded vocabulary, if any. - fn n_vocabulary(&self) -> usize; - - /// Get the filetype of the model. - fn file_type(&self) -> Option; - - /// Get mutable access to filetype of the model. - fn file_type_mut(&mut self) -> Option<&mut FileType>; -} #[derive(Error, Debug)] -/// Reported from functions that write -pub enum HyperparametersWriteError { - #[error("non-specific I/O error")] - /// A non-specific IO error. - Io(#[from] std::io::Error), - #[error("invalid integer conversion")] - /// One of the integers encountered could not be converted to a more appropriate type. - InvalidIntegerConversion(#[from] std::num::TryFromIntError), +/// Reported from functions that read hyperparameters +pub enum HyperparametersReadError { + #[error("{0}")] + /// A metadata error. + MetadataError(#[from] MetadataError), + /// The file type within the model was not supported by this version of `llm`. + #[error("file type {file_type} is not supported")] + UnsupportedFileType { + /// The file type (ignoring the quantization version) that was encountered. + file_type: llama_ftype, + }, } /// Parameters for model-wide behaviour. #[derive(Debug, Clone)] pub struct ModelParameters { - /// For [GGML formats](ggml::ContainerType) that support it, [mmap](https://en.wikipedia.org/wiki/Mmap) + /// For [GGML formats](ggml::format::ContainerType) that support it, [mmap](https://en.wikipedia.org/wiki/Mmap) /// is the default. Although mmap typically improves performance, setting this value to `false` may /// be preferred in resource-constrained environments. pub prefer_mmap: bool, @@ -210,8 +136,6 @@ pub struct ModelParameters { pub gpu_layers: Option, /// The arguments/overrides to pass to the [custom RoPE](https://arxiv.org/pdf/2306.15595.pdf) function, if it is used by the model. 
pub rope_overrides: Option, - /// Enables gouped-query attention for Llama-2 70B model - pub n_gqa: Option, } impl Default for ModelParameters { @@ -223,7 +147,6 @@ impl Default for ModelParameters { use_gpu: false, gpu_layers: None, rope_overrides: None, - n_gqa: None, } } } diff --git a/crates/llm-base/src/quantize.rs b/crates/llm-base/src/quantize.rs index d3d2a0cf..0aa8cc61 100644 --- a/crates/llm-base/src/quantize.rs +++ b/crates/llm-base/src/quantize.rs @@ -1,10 +1,10 @@ +// TODO: Reimeplement entirely for GGUF! +#![allow(unused)] + //! Implements quantization of weights. -use crate::{ - loader::FileTypeFormat, model::HyperparametersWriteError, Hyperparameters, KnownModel, - LoadError, LoadProgress, Loader, Tokenizer, -}; -use ggml::format::{SaveError, SaveHandler, TensorLoadInfo, TensorSaveInfo}; +use crate::{loader::FileTypeFormat, Model, Tokenizer}; +use ggml::format::gguf::GgufSaveError; use half::f16; use regex::Regex; use std::{ @@ -16,7 +16,6 @@ use std::{ use thiserror::Error; #[derive(Clone, Debug)] - /// Progress of quantization. pub enum QuantizeProgress<'a> { /// Hyperparameters have been loaded. @@ -69,9 +68,9 @@ pub enum QuantizeProgress<'a> { #[derive(Error, Debug)] /// Errors encountered during the quantization process. pub enum QuantizeError { - #[error("could not load model")] - /// There was an error while attempting to load the model. - Load(#[from] LoadError), + // #[error("could not load model")] + // /// There was an error while attempting to load the model. + // Load(#[from] LoadError), #[error("non-specific I/O error")] /// A non-specific IO error. Io(#[from] std::io::Error), @@ -111,113 +110,112 @@ pub enum QuantizeError { /// The element type. element_type: ggml::Type, }, - /// An error was encountered while writing the hyperparameters. - #[error("an error was encountered while writing the hyperparameters")] - HyperparametersWriteError(#[source] HyperparametersWriteError), /// An attempt was made to save a model with a container type that does not /// support vocabulary scoring, despite the model having a scored vocabulary. #[error("container type does not support vocabulary scoring")] VocabularyScoringNotSupported, } impl QuantizeError { - pub(crate) fn from_format_error(value: SaveError, path: PathBuf) -> Self { - match value { - SaveError::Io(io) => QuantizeError::Io(io), - SaveError::InvalidIntegerConversion(e) => QuantizeError::InvalidIntegerConversion(e), - SaveError::ImplementationError(e) => e, - SaveError::InvariantBroken(invariant) => { - QuantizeError::InvariantBroken { path, invariant } - } - SaveError::VocabularyScoringNotSupported => { - QuantizeError::VocabularyScoringNotSupported - } - } + pub(crate) fn from_format_error(value: GgufSaveError, path: PathBuf) -> Self { + todo!() + // match value { + // SaveError::Io(io) => QuantizeError::Io(io), + // SaveError::InvalidIntegerConversion(e) => QuantizeError::InvalidIntegerConversion(e), + // SaveError::ImplementationError(e) => e, + // SaveError::InvariantBroken(invariant) => { + // QuantizeError::InvariantBroken { path, invariant } + // } + // SaveError::VocabularyScoringNotSupported => { + // QuantizeError::VocabularyScoringNotSupported + // } + // } } } /// Quantizes a model. 
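// NOTE: as of this change the body of `quantize` below is almost entirely commented out and
// ends in `todo!()`, so calling it will panic; quantization remains unavailable until the
// GGUF-based reimplementation referenced in the module-level TODO is written.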
-pub fn quantize( +pub fn quantize( reader: &mut R, writer: &mut W, tokenizer: Tokenizer, - save_container_type: ggml::format::SaveContainerType, quantization_type: ggml::Type, progress_callback: impl Fn(QuantizeProgress), ) -> Result<(), QuantizeError> { - // Sanity check - let quantization_target = QuantizationTarget::try_from(quantization_type).map_err(|_| { - QuantizeError::InvalidQuantizationTarget { - element_type: quantization_type, - } - })?; + // // Sanity check + // let quantization_target = QuantizationTarget::try_from(quantization_type).map_err(|_| { + // QuantizeError::InvalidQuantizationTarget { + // element_type: quantization_type, + // } + // })?; - // Load the model - let progress_callback = Arc::new(progress_callback); + // // Load the model + // let progress_callback = Arc::new(progress_callback); - let mut loader = Loader::::new(tokenizer, { - let progress_callback = progress_callback.clone(); - move |p| { - if let LoadProgress::HyperparametersLoaded = p { - progress_callback(QuantizeProgress::HyperparametersLoaded) - } - } - }); - ggml::format::load(reader, &mut loader) - .map_err(|err| LoadError::from_format_error(err, PathBuf::default()))?; - - // Save the quantized model, quantizing as we go - let Loader { - mut hyperparameters, - tokenizer, - tensors, - .. - } = loader; - - if let Some(ft) = hyperparameters.file_type_mut() { - ft.quantization_version = ggml::QNT_VERSION; - ft.format = quantization_target - .try_into() - .expect("format has no corresponding ftype"); - } + // let mut loader = Loader::::new(tokenizer, { + // let progress_callback = progress_callback.clone(); + // move |p| { + // if let LoadProgress::HyperparametersLoaded = p { + // progress_callback(QuantizeProgress::HyperparametersLoaded) + // } + // } + // }); + // ggml::format::ggml::load(reader, &mut loader) + // .map_err(|err| LoadError::from_format_error(err, PathBuf::default()))?; + + // // Save the quantized model, quantizing as we go + // let Loader { + // mut hyperparameters, + // tokenizer, + // tensors, + // .. 
+ // } = loader; - let tokenizer = match tokenizer { - Tokenizer::Embedded(v) => v.iter().collect::>(), - Tokenizer::HuggingFace(_) => vec![], - }; - - let to_quantize = M::quantize_tensors(); - let to_skip = M::skip_quantize_tensors(); - let mut saver = QuantizeSaver::new( - quantization_target, - &hyperparameters, - &tensors, - &to_quantize, - &to_skip, - reader, - |p| progress_callback(p), - ); - ggml::format::save( - writer, - &mut saver, - save_container_type, - &tokenizer, - &tensors.keys().cloned().collect::>(), - ) - .map_err(|err| QuantizeError::from_format_error(err, PathBuf::default()))?; - - // Final report - let sum_all: i64 = saver.history_all.iter().sum(); - progress_callback(QuantizeProgress::Finished { - original_size: saver.total_size_original, - reduced_size: saver.total_size_new, - history: saver - .history_all - .iter() - .map(|hist| *hist as f32 / sum_all as f32) - .collect(), - }); - - Ok(()) + // if let Some(ft) = hyperparameters.file_type_mut() { + // ft.quantization_version = ggml::QNT_VERSION; + // ft.format = quantization_target + // .try_into() + // .expect("format has no corresponding ftype"); + // } + + // let tokenizer = match tokenizer { + // Tokenizer::Embedded(v) => v.iter().collect::>(), + // Tokenizer::HuggingFace(_) => vec![], + // }; + + // let to_quantize = M::quantize_tensors(); + // let to_skip = M::skip_quantize_tensors(); + // let mut saver = QuantizeSaver::new( + // quantization_target, + // &hyperparameters, + // &tensors, + // &to_quantize, + // &to_skip, + // reader, + // |p| progress_callback(p), + // ); + // ggml::format::ggml::save( + // writer, + // &mut saver, + // save_container_type, + // &tokenizer, + // &tensors.keys().cloned().collect::>(), + // ) + // .map_err(|err| QuantizeError::from_format_error(err, PathBuf::default()))?; + + // // Final report + // let sum_all: i64 = saver.history_all.iter().sum(); + // progress_callback(QuantizeProgress::Finished { + // original_size: saver.total_size_original, + // reduced_size: saver.total_size_new, + // history: saver + // .history_all + // .iter() + // .map(|hist| *hist as f32 / sum_all as f32) + // .collect(), + // }); + + // Ok(()) + + todo!("reimeplement for GGUF") } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -265,150 +263,150 @@ impl From for FileTypeFormat { } } -struct QuantizeSaver<'a, F: Fn(QuantizeProgress), H: Hyperparameters, R: BufRead + Seek> { - // Input - quantization_target: QuantizationTarget, - hyperparameters: &'a H, - tensors: &'a HashMap, - to_quantize: &'a [Regex], - to_skip: &'a [Regex], - source_reader: &'a mut R, - progress_callback: F, - - // Output - total_size_original: usize, - total_size_new: usize, - history_all: Vec, -} -impl<'a, F: Fn(QuantizeProgress), H: Hyperparameters, R: BufRead + Seek> - QuantizeSaver<'a, F, H, R> -{ - fn new( - quantization_target: QuantizationTarget, - hyperparameters: &'a H, - tensors: &'a HashMap, - to_quantize: &'a [Regex], - to_skip: &'a [Regex], - source_reader: &'a mut R, - progress_callback: F, - ) -> Self { - Self { - quantization_target, - hyperparameters, - tensors, - to_quantize, - to_skip, - source_reader, - progress_callback, - - total_size_original: 0, - total_size_new: 0, - history_all: vec![0; 16], - } - } -} -impl SaveHandler - for QuantizeSaver<'_, F, H, R> -{ - fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), QuantizeError> { - self.hyperparameters - .write_ggml(writer) - .map_err(QuantizeError::HyperparametersWriteError)?; - Ok(()) - } +// struct QuantizeSaver<'a, F: 
Fn(QuantizeProgress), H: Hyperparameters, R: BufRead + Seek> { +// // Input +// quantization_target: QuantizationTarget, +// hyperparameters: &'a H, +// tensors: &'a HashMap, +// to_quantize: &'a [Regex], +// to_skip: &'a [Regex], +// source_reader: &'a mut R, +// progress_callback: F, - fn tensor_data(&mut self, tensor_name: &str) -> Result { - let tensor = self.tensors.get(tensor_name).expect( - "tensor not found; should be impossible due to handler being populated from loader", - ); - - (self.progress_callback)(QuantizeProgress::TensorLoading { - name: tensor_name, - dims: tensor.dims, - n_elements: tensor.n_elements, - element_type: tensor.element_type, - }); - - // Quantize only 2D tensors - let quantize = tensor.n_dims == 2 - && self.to_quantize.iter().any(|re| re.is_match(tensor_name)) - && !self.to_skip.iter().any(|re| re.is_match(tensor_name)); - let raw_data = tensor.read_data(self.source_reader)?; - - if quantize && !matches!(tensor.element_type, ggml::Type::F32 | ggml::Type::F16) { - return Err(QuantizeError::UnsupportedElementType { - element_type: tensor.element_type, - }); - } +// // Output +// total_size_original: usize, +// total_size_new: usize, +// history_all: Vec, +// } +// impl<'a, F: Fn(QuantizeProgress), H: Hyperparameters, R: BufRead + Seek> +// QuantizeSaver<'a, F, H, R> +// { +// fn new( +// quantization_target: QuantizationTarget, +// hyperparameters: &'a H, +// tensors: &'a HashMap, +// to_quantize: &'a [Regex], +// to_skip: &'a [Regex], +// source_reader: &'a mut R, +// progress_callback: F, +// ) -> Self { +// Self { +// quantization_target, +// hyperparameters, +// tensors, +// to_quantize, +// to_skip, +// source_reader, +// progress_callback, - self.total_size_original += raw_data.len(); - - let (element_type, data) = if quantize { - (self.progress_callback)(QuantizeProgress::TensorQuantizing { name: tensor_name }); - - let data_f32: Vec = match tensor.element_type { - ggml::Type::F32 => raw_data - .chunks_exact(4) - .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) - .collect(), - ggml::Type::F16 => raw_data - .chunks_exact(2) - .map(|chunk| { - f16::from_bits(u16::from_le_bytes(chunk.try_into().unwrap())).to_f32() - }) - .collect(), - _ => unreachable!(), - }; - - let result = match self.quantization_target { - QuantizationTarget::Q4_0 => { - ggml::quantize_q4_0(&data_f32, tensor.n_elements, tensor.dims[0]) - } - QuantizationTarget::Q4_1 => { - ggml::quantize_q4_1(&data_f32, tensor.n_elements, tensor.dims[0]) - } - QuantizationTarget::Q5_0 => { - ggml::quantize_q5_0(&data_f32, tensor.n_elements, tensor.dims[0]) - } - QuantizationTarget::Q5_1 => { - ggml::quantize_q5_1(&data_f32, tensor.n_elements, tensor.dims[0]) - } - QuantizationTarget::Q8_0 => { - ggml::quantize_q8_0(&data_f32, tensor.n_elements, tensor.dims[0]) - } - }; - let new_data = result.output; - - let mut history_new = vec![]; - for (i, val) in result.history.iter().enumerate() { - self.history_all[i] += val; - history_new.push(*val as f32 / tensor.n_elements as f32); - } - - (self.progress_callback)(QuantizeProgress::TensorQuantized { - name: tensor_name, - original_size: raw_data.len(), - reduced_size: new_data.len(), - history: history_new, - }); - - self.total_size_new += new_data.len(); - - (self.quantization_target.into(), new_data) - } else { - (self.progress_callback)(QuantizeProgress::TensorSkipped { - name: tensor_name, - size: raw_data.len(), - }); - self.total_size_new += raw_data.len(); - (tensor.element_type, raw_data) - }; - - Ok(TensorSaveInfo { - n_dims: 
tensor.n_dims, - dims: tensor.dims, - element_type, - data, - }) - } -} +// total_size_original: 0, +// total_size_new: 0, +// history_all: vec![0; 16], +// } +// } +// } +// impl SaveHandler +// for QuantizeSaver<'_, F, H, R> +// { +// fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), QuantizeError> { +// self.hyperparameters +// .write_ggml(writer) +// .map_err(QuantizeError::HyperparametersWriteError)?; +// Ok(()) +// } + +// fn tensor_data(&mut self, tensor_name: &str) -> Result { +// let tensor = self.tensors.get(tensor_name).expect( +// "tensor not found; should be impossible due to handler being populated from loader", +// ); + +// (self.progress_callback)(QuantizeProgress::TensorLoading { +// name: tensor_name, +// dims: tensor.dims, +// n_elements: tensor.n_elements, +// element_type: tensor.element_type, +// }); + +// // Quantize only 2D tensors +// let quantize = tensor.n_dims == 2 +// && self.to_quantize.iter().any(|re| re.is_match(tensor_name)) +// && !self.to_skip.iter().any(|re| re.is_match(tensor_name)); +// let raw_data = tensor.read_data(self.source_reader)?; + +// if quantize && !matches!(tensor.element_type, ggml::Type::F32 | ggml::Type::F16) { +// return Err(QuantizeError::UnsupportedElementType { +// element_type: tensor.element_type, +// }); +// } + +// self.total_size_original += raw_data.len(); + +// let (element_type, data) = if quantize { +// (self.progress_callback)(QuantizeProgress::TensorQuantizing { name: tensor_name }); + +// let data_f32: Vec = match tensor.element_type { +// ggml::Type::F32 => raw_data +// .chunks_exact(4) +// .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) +// .collect(), +// ggml::Type::F16 => raw_data +// .chunks_exact(2) +// .map(|chunk| { +// f16::from_bits(u16::from_le_bytes(chunk.try_into().unwrap())).to_f32() +// }) +// .collect(), +// _ => unreachable!(), +// }; + +// let result = match self.quantization_target { +// QuantizationTarget::Q4_0 => { +// ggml::quantize_q4_0(&data_f32, tensor.n_elements, tensor.dims[0]) +// } +// QuantizationTarget::Q4_1 => { +// ggml::quantize_q4_1(&data_f32, tensor.n_elements, tensor.dims[0]) +// } +// QuantizationTarget::Q5_0 => { +// ggml::quantize_q5_0(&data_f32, tensor.n_elements, tensor.dims[0]) +// } +// QuantizationTarget::Q5_1 => { +// ggml::quantize_q5_1(&data_f32, tensor.n_elements, tensor.dims[0]) +// } +// QuantizationTarget::Q8_0 => { +// ggml::quantize_q8_0(&data_f32, tensor.n_elements, tensor.dims[0]) +// } +// }; +// let new_data = result.output; + +// let mut history_new = vec![]; +// for (i, val) in result.history.iter().enumerate() { +// self.history_all[i] += val; +// history_new.push(*val as f32 / tensor.n_elements as f32); +// } + +// (self.progress_callback)(QuantizeProgress::TensorQuantized { +// name: tensor_name, +// original_size: raw_data.len(), +// reduced_size: new_data.len(), +// history: history_new, +// }); + +// self.total_size_new += new_data.len(); + +// (self.quantization_target.into(), new_data) +// } else { +// (self.progress_callback)(QuantizeProgress::TensorSkipped { +// name: tensor_name, +// size: raw_data.len(), +// }); +// self.total_size_new += raw_data.len(); +// (tensor.element_type, raw_data) +// }; + +// Ok(TensorSaveInfo { +// n_dims: tensor.n_dims, +// dims: tensor.dims, +// element_type, +// data, +// }) +// } +// } diff --git a/crates/llm-base/src/tokenizer/embedded.rs b/crates/llm-base/src/tokenizer/embedded.rs index cf96b183..25387d23 100644 --- a/crates/llm-base/src/tokenizer/embedded.rs +++ 
b/crates/llm-base/src/tokenizer/embedded.rs @@ -1,7 +1,14 @@ -use std::collections::HashMap; +use std::{ + cmp::Ordering, + collections::{BinaryHeap, HashMap}, + str::FromStr, +}; +use ggml::format::gguf::{Metadata, MetadataError}; use thiserror::Error; +use crate::TokenizerLoadError; + use super::{Token, TokenId, TokenScore, TokenizationError}; #[derive(Debug, Error)] @@ -13,43 +20,113 @@ pub enum EmbeddedTokenizerError { } /// The built-in GGML tokenizer. -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub struct EmbeddedTokenizer { /// Maps every integer (index) token ID to its corresponding token. - id_to_token: Vec, - - /// Maps every integer (index) token ID to corresponding score. - id_to_token_score: Vec, + id_to_token: Vec, // todo: use a radix tree /// Maps a token to a token ID. token_to_id: HashMap, - /// The longest token in this tokenizer. - max_token_length: usize, + model: GgufEmbeddedTokenizerModel, + bos_id: TokenId, + _eos_id: TokenId, + _unknown_id: TokenId, + linefeed_id: TokenId, + _separator_id: Option, + _padding_id: Option, +} +#[derive(Debug, Clone, Default)] +struct TokenData { + text: Token, + score: TokenScore, + ty: TokenType, } - impl EmbeddedTokenizer { - /// Add a token to the internal vocabulary. - /// - /// The token added must have `id` directly after the last token in the vocabulary. - /// - /// # Panics - /// - This function can panic if `id` does not correspond to the next token in the vocabulary. - /// That is, if there are already `n` tokens in the vocabulary, then `id` must be `n`. - pub(crate) fn push_token(&mut self, id: TokenId, content: Token, score: TokenScore) { - // These are loader invariants. If this is broken, then the loader is broken and this is a bug, - // not an issue with the model itself. 
- assert_eq!(self.id_to_token.len(), self.id_to_token_score.len()); - if self.id_to_token.len() != id as usize || self.id_to_token_score.len() != id as usize { - let expected_id = self.id_to_token.len() as TokenId; - panic!("the id of token added should be {expected_id}; is {id}"); - } + pub(crate) fn is_present_in_metadata(metadata: &Metadata) -> bool { + metadata.contains_key("tokenizer.ggml.scores") + } + + pub(crate) fn from_metadata(metadata: &Metadata) -> Result { + let tok = GgufEmbeddedTokenizer::from_metadata(metadata)?; + + let model = if let Some(model) = tok.model { + model + .parse::() + .expect("TODO: handle invalid tokenizer model") + } else { + GgufEmbeddedTokenizerModel::Llama + }; + + match model { + GgufEmbeddedTokenizerModel::Llama => { + let bos_id = metadata + .get_with_type("tokenizer.ggml.bos_token_id", |v| v.as_uint32()) + .unwrap_or(1); + let eos_id = metadata + .get_with_type("tokenizer.ggml.eos_token_id", |v| v.as_uint32()) + .unwrap_or(2); + let unknown_id = metadata + .get_with_type("tokenizer.ggml.unknown_token_id", |v| v.as_uint32()) + .unwrap_or(0); + let separator_id = metadata + .get_with_type("tokenizer.ggml.separator_token_id", |v| v.as_uint32()) + .ok(); + let padding_id = metadata + .get_with_type("tokenizer.ggml.padding_token_id", |v| v.as_uint32()) + .ok(); + + let tokens = metadata.get_array_with_type("tokenizer.ggml.tokens", |v| { + v.as_array()?.as_string_array() + })?; + let scores = metadata + .get_array_with_type("tokenizer.ggml.scores", |v| { + v.as_array()?.as_float32_array() + }) + .unwrap_or_default(); + let types = metadata + .get_array_with_type("tokenizer.ggml.token_type", |v| { + v.as_array()?.as_int32_array() + }) + .unwrap_or_default(); - self.max_token_length = self.max_token_length.max(content.len()); - self.id_to_token.push(content.clone()); - self.id_to_token_score.push(score); - self.token_to_id.insert(content, id); + let mut token_to_id = HashMap::default(); + let mut id_to_token = vec![TokenData::default(); tokens.len()]; + + for (i, token) in tokens.iter().enumerate() { + let word = token.as_bytes().to_vec(); + token_to_id.insert(word.clone(), i as TokenId); + id_to_token[i] = TokenData { + text: word.clone(), + score: scores.get(i).copied().unwrap_or(0.0), + ty: match types.get(i) { + Some(tok) => { + TokenType::try_from(*tok).expect("TODO: handle invalid token type") + } + None => TokenType::Normal, + }, + }; + } + + let mut tokenizer = EmbeddedTokenizer { + token_to_id, + id_to_token, + model: GgufEmbeddedTokenizerModel::Llama, + bos_id, + _eos_id: eos_id, + _unknown_id: unknown_id, + linefeed_id: 0, + _separator_id: separator_id, + _padding_id: padding_id, + }; + + tokenizer.linefeed_id = tokenizer.byte_to_token(b'\n'); + + Ok(tokenizer) + } + _ => unimplemented!(), + } } pub(crate) fn id(&self, token: &[u8]) -> Option { @@ -57,8 +134,8 @@ impl EmbeddedTokenizer { } /// Converts a token index to the token it represents in this tokenizer. - pub(crate) fn token(&self, idx: usize) -> Vec { - self.id_to_token[idx].clone() + pub(crate) fn token(&self, idx: usize) -> Token { + self.id_to_token[idx].text.clone() } /// Returns the number of tokens in the tokenizer. @@ -71,7 +148,6 @@ impl EmbeddedTokenizer { self.id_to_token.is_empty() } - // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece /// Tokenize a `text` with this tokenizer. /// /// `bos` controls whether a beginning-of-string token should be inserted. 
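// A rough crate-internal usage sketch of the embedded tokenizer as constructed above
// (hypothetical variable names; `gguf` is a parsed `Gguf`, error handling is collapsed into
// `expect` for brevity, and the tokenizer is only built when the GGUF actually carries
// `tokenizer.ggml.*` metadata):
//
//     if EmbeddedTokenizer::is_present_in_metadata(&gguf.metadata) {
//         let tokenizer = EmbeddedTokenizer::from_metadata(&gguf.metadata)
//             .expect("embedded tokenizer metadata");
//         // `tokenize` returns (token bytes, token id) pairs; `true` prepends the BOS token.
//         let tokens = tokenizer.tokenize("Hello world", true).expect("tokenization");
//         let ids: Vec<TokenId> = tokens.into_iter().map(|(_, id)| id).collect();
//         // `decode` maps ids back to bytes, unescaping the SentencePiece whitespace marker.
//         let bytes = tokenizer.decode(ids, false);
//     }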
@@ -79,79 +155,368 @@ impl EmbeddedTokenizer { &self, text: &str, bos: bool, - ) -> Result, TokenId)>, TokenizationError> { - let len = text.len(); - - let mut score = vec![0usize; len + 1]; - let mut prev = vec![TokenId::default(); len + 1]; - - for i in 0..len { - let max_len = (len - i).min(self.max_token_length); - for sub_len in 1..=max_len { - let sub = &text.as_bytes()[i..i + sub_len]; - let token = self.token_to_id.get(sub); - - if let Some(token) = token { - let token_score = sub.len() * sub.len(); - let local_score = score[i] + token_score; - let next = i + sub_len; - - if score[next] < local_score { - score[next] = local_score; - prev[next] = *token; + ) -> Result, TokenizationError> { + let mut output = vec![]; + + if bos { + output.push(( + self.id_to_token[self.bos_id as usize].text.clone(), + self.bos_id, + )); + } + + if text.is_empty() { + return Ok(output); + } + + match self.model { + GgufEmbeddedTokenizerModel::Llama => { + let text = escape_whitespace(format!(" {text}").as_bytes()); + + Ok(TokenizerSpm::new(self) + .tokenize(&text) + .into_iter() + .map(|id| { + // TODO: see if this can be made more efficient + (self.id_to_token[id as usize].text.clone(), id) + }) + .collect()) + } + _ => unimplemented!(), + } + } + + /// Decode a list `tokens` with this tokenizer. + pub(crate) fn decode(&self, tokens: Vec, _skip_special_tokens: bool) -> Vec { + let mut ret = vec![]; + + match self.model { + GgufEmbeddedTokenizerModel::Llama => { + for token_id in tokens { + let token = &self.id_to_token[token_id as usize]; + match token.ty { + TokenType::Normal => { + ret.append(&mut unescape_whitespace(&token.text)); + } + TokenType::Unknown => { + assert_eq!(token.text.len(), 3); + ret.extend_from_slice(&[0xE2, 0x96, 0x85]); + } + TokenType::Byte => { + ret.push(self.token_to_byte(token_id)); + } + TokenType::Control | TokenType::UserDefined | TokenType::Unused => {} } } } + _ => unimplemented!(), } - // Backward pass - let mut res = vec![]; - let mut i = len; - while i > 0 { - let token_id = prev[i]; - if token_id == 0 { - return Err(TokenizationError::TokenizationFailed { - error: Box::new(EmbeddedTokenizerError::Arbitrary( - "the backward pass for the tokenizer encountered a non-set token" - .to_string(), - )), - }); + ret + } +} +impl EmbeddedTokenizer { + fn byte_to_token(&self, ch: u8) -> TokenId { + let token = format!("<0x{ch:02X}>"); + self.token_to_id.get(token.as_bytes()).copied().unwrap() + } + + fn token_to_byte(&self, token_id: TokenId) -> u8 { + let data = &self.id_to_token[token_id as usize]; + assert_eq!(data.ty, TokenType::Byte); + + match self.model { + GgufEmbeddedTokenizerModel::Llama => { + u8::from_str_radix(std::str::from_utf8(&data.text[3..5]).unwrap(), 16).unwrap() } - let token = self.id_to_token[token_id as usize].as_slice(); - res.push((token.to_vec(), token_id)); - i -= token.len(); + _ => unimplemented!(), } + } +} - if bos { - // TODO: replace with vocab.bos - res.push((vec![], 1)); +/// An embedded tokenizer definition in a GGUF. +pub struct GgufEmbeddedTokenizer<'a> { + /// The model type. + pub model: Option<&'a str>, + /// The tokens. + pub tokens: &'a [String], + /// The token scores. + pub scores: &'a [f32], + /// The token types. + pub types: Option<&'a [u32]>, +} +impl GgufEmbeddedTokenizer<'_> { + /// Attempt to retrieve the embedded tokenizer from the metadata. 
+ pub fn from_metadata(metadata: &Metadata) -> Result { + Ok(GgufEmbeddedTokenizer { + model: metadata + .get_optional("tokenizer.ggml.model") + .and_then(|v| v.as_string()), + tokens: metadata.get_array_with_type("tokenizer.ggml.tokens", |v| { + v.as_array()?.as_string_array() + })?, + scores: metadata.get_array_with_type("tokenizer.ggml.scores", |v| { + v.as_array()?.as_float32_array() + })?, + types: metadata + .get_array_with_type("tokenizer.ggml.token_type", |v| { + v.as_array()?.as_uint32_array() + }) + .ok(), + }) + } +} + +/// Typesafe tokenizer models. +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub enum GgufEmbeddedTokenizerModel { + /// Llama style SentencePiece (tokens and scores extracted from HF `tokenizer.model`) + Llama, + /// Replit style SentencePiece (tokens and scores extracted from HF `spiece.model`) + Replit, + /// GPT-2 / GPT-NeoX style BPE (tokens extracted from HF `tokenizer.json`) + Gpt2, + /// RWKV tokenizer + Rwkv, +} +impl FromStr for GgufEmbeddedTokenizerModel { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "llama" => Ok(Self::Llama), + "replit" => Ok(Self::Replit), + "gpt2" => Ok(Self::Gpt2), + "rwkv" => Ok(Self::Rwkv), + other => Err(other.to_string()), } + } +} - // Pieces are in reverse order so correct that - res.reverse(); +/// The type of a token. +#[allow(missing_docs)] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default)] +pub enum TokenType { + #[default] + Normal, + Unknown, + Control, + UserDefined, + Unused, + Byte, +} +impl TryFrom for TokenType { + type Error = i32; - Ok(res) + fn try_from(value: i32) -> Result { + match value { + 1 => Ok(Self::Normal), + 2 => Ok(Self::Unknown), + 3 => Ok(Self::Control), + 4 => Ok(Self::UserDefined), + 5 => Ok(Self::Unused), + 6 => Ok(Self::Byte), + other => Err(other), + } } +} - /// Decode a list `tokens` with this tokenizer. 
- pub(crate) fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { - let mut vec = vec![]; +#[derive(Clone)] +struct Symbol { + prev: isize, + next: isize, + text: Vec, + n: usize, +} + +struct LlmBigramSpm { + left: isize, + right: isize, + score: f32, + size: usize, +} +impl PartialOrd for LlmBigramSpm { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl Ord for LlmBigramSpm { + fn cmp(&self, other: &Self) -> Ordering { + self.score + .partial_cmp(&other.score) + .unwrap_or(Ordering::Equal) + .then_with(|| other.left.cmp(&self.left)) + } +} + +impl PartialEq for LlmBigramSpm { + fn eq(&self, other: &Self) -> bool { + self.score == other.score && self.left == other.left + } +} + +impl Eq for LlmBigramSpm {} - for token in tokens { - if skip_special_tokens && token == 1 { +struct TokenizerSpm<'a> { + vocab: &'a EmbeddedTokenizer, + symbols: Vec, + work_queue: BinaryHeap, + rev_merge: HashMap, +} + +impl<'a> TokenizerSpm<'a> { + fn new(vocab: &'a EmbeddedTokenizer) -> Self { + Self { + vocab, + symbols: Vec::new(), + work_queue: BinaryHeap::new(), + rev_merge: HashMap::new(), + } + } + + fn tokenize(&mut self, text: &[u8]) -> Vec { + let mut output = vec![]; + let mut index = 0; + let mut offs = 0; + while offs < text.len() { + let len = text[offs..].len(); + let sym = Symbol { + text: text[offs..offs + len].to_vec(), + n: len.min(text.len() - offs), + prev: index - 1, + next: if offs + len == text.len() { + -1 + } else { + index + 1 + }, + }; + offs += sym.n; + index += 1; + self.symbols.push(sym); + } + + for i in 1..self.symbols.len() { + self.try_add_bigram((i - 1) as isize, i as isize); + } + + while let Some(bigram) = self.work_queue.pop() { + let mut left_sym = self.symbols[bigram.left as usize].clone(); + let mut right_sym = self.symbols[bigram.right as usize].clone(); + + if left_sym.n == 0 || right_sym.n == 0 || left_sym.n + right_sym.n != bigram.size { continue; } - vec.append(&mut self.id_to_token[token as usize].to_vec()); + left_sym.n += right_sym.n; + right_sym.n = 0; + + left_sym.next = right_sym.next; + if right_sym.next >= 0 { + self.symbols[right_sym.next as usize].prev = bigram.left; + } + + let left_sym_prev = left_sym.prev; + let left_sym_next = left_sym.next; + + self.symbols[bigram.left as usize] = left_sym; + self.symbols[bigram.right as usize] = right_sym; + + self.try_add_bigram(left_sym_prev, bigram.left); + self.try_add_bigram(bigram.left, left_sym_next); + } + + let mut i = 0; + while i != -1 { + let symbol = &self.symbols[i as usize]; + self.resegment(symbol, &mut output); + i = symbol.next; + } + output + } + + fn resegment(&self, symbol: &Symbol, output: &mut Vec) { + let text = symbol.text.clone(); + if let Some(&token_id) = self.vocab.token_to_id.get(&text) { + output.push(token_id); + return; + } + + if let Some(&(left, right)) = self.rev_merge.get(&text) { + self.resegment(&self.symbols[left as usize], output); + self.resegment(&self.symbols[right as usize], output); + } else { + for &ch in &text { + let token_id = self.vocab.byte_to_token(ch); + output.push(token_id); + } + } + } + + fn try_add_bigram(&mut self, left: isize, right: isize) { + if left == -1 || right == -1 { + return; } - vec + let text = [ + self.symbols[left as usize].text.clone(), + self.symbols[right as usize].text.clone(), + ] + .concat(); + if let Some(&token_id) = self.vocab.token_to_id.get(&text) { + if (token_id as usize) < self.vocab.id_to_token.len() { + let tok_data = &self.vocab.id_to_token[token_id as usize]; + let bigram = 
LlmBigramSpm { + left, + right, + score: tok_data.score, + size: text.len(), + }; + self.work_queue.push(bigram); + self.rev_merge.insert(text, (left, right)); + } + } } +} + +fn escape_whitespace(text: &[u8]) -> Vec { + let mut out = vec![]; - pub(crate) fn iter(&self) -> impl Iterator + '_ { - self.id_to_token - .iter() - .zip(self.id_to_token_score.iter()) - .map(|(token, score)| (token.clone(), *score)) + for &b in text { + if b == b' ' { + out.extend_from_slice(&[0xE2, 0x96, 0x81]); + } else { + out.push(b); + } } + + out +} + +fn unescape_whitespace(text: &[u8]) -> Vec { + let mut out = vec![]; + let mut buffer: Vec = vec![]; + + for &b in text { + #[allow(clippy::if_same_then_else)] + if b == 0xE2 { + // If the current byte is 0xE2, start buffering and check for the sequence. + buffer.push(b); + } else if buffer.len() == 1 && b == 0x96 { + // If the previous byte was 0xE2 and the current byte is 0x96, continue buffering. + buffer.push(b); + } else if buffer.len() == 2 && b == 0x81 { + // If the previous bytes were 0xE2 and 0x96 and the current byte is 0x81, replace with space and reset buffer. + out.push(b' '); + buffer.clear(); + } else { + // If no match, flush the buffer and append the current byte. + out.append(&mut buffer); + out.push(b); + } + } + + // If there are any remaining bytes in the buffer, append them. + out.append(&mut buffer); + + out } diff --git a/crates/llm-base/src/tokenizer/mod.rs b/crates/llm-base/src/tokenizer/mod.rs index 03b2f0b9..9852993e 100644 --- a/crates/llm-base/src/tokenizer/mod.rs +++ b/crates/llm-base/src/tokenizer/mod.rs @@ -1,16 +1,15 @@ -use std::{ - error::Error, - fmt::Display, - path::{Path, PathBuf}, - str::FromStr, -}; +//! Tokenizer-related functionality. +use std::{error::Error, fmt::Display, path::PathBuf, str::FromStr}; + +use ggml::format::gguf::{Gguf, MetadataError}; use thiserror::Error; mod embedded; pub use embedded::*; mod huggingface; pub use huggingface::*; +pub use tokenizers as huggingface_tokenizers; /// The identifier of a token in a tokenizer. pub type TokenId = u32; @@ -20,7 +19,7 @@ pub(crate) type TokenScore = f32; #[derive(Error, Debug)] /// Errors related to tokenization. pub enum TokenizationError { - #[error("an invalid token was encountered during tokenization")] + #[error("an invalid token was encountered during tokenization: {error}")] /// During tokenization, one of the produced tokens was invalid / zero. TokenizationFailed { #[source] @@ -35,22 +34,55 @@ pub enum TokenizationError { #[derive(Error, Debug)] /// Errors related to loading the tokenizer. #[error("error loading tokenizer from {path}: {error}")] -pub struct TokenizerLoadError { - /// The path to the tokenizer. - pub path: PathBuf, - /// The error that occurred during loading. - pub error: Box, +pub enum TokenizerLoadError { + #[error("error loading Hugging Face tokenizer from {tokenizer_source}: {error}")] + /// An error occurred while loading a Hugging Face tokenizer. + HuggingFaceTokenizerError { + /// The source of the tokenizer that failed. + tokenizer_source: HuggingFaceTokenizerErrorSource, + /// The error that occurred during loading. + error: Box, + }, + #[error("no supported tokenizers were found, including in the model file: {unsupported_tokenizers:?}")] + /// No supported tokenizers were found, including in the model file. + NoSupportedTokenizersFound { + /// The list of tokenizers that were found, but not supported. + unsupported_tokenizers: Vec, + }, + #[error("{0}")] + /// An error occurred while retrieving data from the metadata. 
+ MetadataError(#[from] MetadataError), } -impl TokenizerLoadError { - fn new(path: impl Into, error: impl Into>) -> Self { - Self { - path: path.into(), - error: error.into(), +/// Used to identify where the tokenizer that errored came from. +// NOTE: We could potentially reuse `TokenizerSource` for this, but I want to avoid +// cloning and/or displaying the entire `String` case. Revisit in future and see if +// I still feel the same. +#[derive(Debug)] +pub enum HuggingFaceTokenizerErrorSource { + /// The tokenizer was loaded from this file. + File(PathBuf), + /// The tokenizer was loaded from the provided string. + String, + #[cfg(feature = "tokenizers-remote")] + /// The tokenizer was loaded from the given HF ID. + Remote(String), +} +impl Display for HuggingFaceTokenizerErrorSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::File(file) => write!(f, "file {file:?}"), + Self::String => write!(f, "string"), + #[cfg(feature = "tokenizers-remote")] + Self::Remote(remote) => write!(f, "HF ID {remote:?}"), } } } +/// At the time of writing, the embedded tokenizer is not enabled as it has +/// some bugs. We're just not enabling the option while it's broken. +const EMBEDDED_TOKENIZER_ENABLED: bool = false; + #[derive(Clone, Debug, PartialEq)] /// The source of a tokenizer. pub enum TokenizerSource { @@ -76,35 +108,66 @@ impl TokenizerSource { /// Retrieve the tokenizer from the source. /// /// Note that this may make a blocking HTTP request to Hugging Face to retrieve the tokenizer. - /// if `self` is [`Self::HuggingFaceRemote`]. - pub fn retrieve(self, model_path: &Path) -> Result { - let _ = model_path; - - Ok(match self { + /// if `self` is `Self::HuggingFaceRemote`. + pub fn retrieve(self, gguf: &Gguf) -> Result { + match self { #[cfg(feature = "tokenizers-remote")] - Self::HuggingFaceRemote(identifier) => HuggingFaceTokenizer::new( - tokenizers::Tokenizer::from_pretrained(&identifier, None) - .map_err(|error| TokenizerLoadError::new(model_path, error))?, + Self::HuggingFaceRemote(identifier) => Ok(HuggingFaceTokenizer::new( + tokenizers::Tokenizer::from_pretrained(&identifier, None).map_err(|error| { + TokenizerLoadError::HuggingFaceTokenizerError { + tokenizer_source: HuggingFaceTokenizerErrorSource::Remote( + identifier.clone(), + ), + error, + } + })?, ) - .into(), - - Self::HuggingFaceTokenizerFile(path) => HuggingFaceTokenizer::new( - tokenizers::Tokenizer::from_file(&path) - .map_err(|error| TokenizerLoadError::new(path, error))?, + .into()), + + Self::HuggingFaceTokenizerFile(path) => Ok(HuggingFaceTokenizer::new( + tokenizers::Tokenizer::from_file(&path).map_err(|error| { + TokenizerLoadError::HuggingFaceTokenizerError { + tokenizer_source: HuggingFaceTokenizerErrorSource::File(path.clone()), + error, + } + })?, ) - .into(), + .into()), + + Self::HuggingFaceTokenizerString(s) => Ok(Self::load_huggingface_json(&s)?), + + Self::Embedded => { + if let Ok(hf) = gguf.metadata.get_str("tokenizer.huggingface.json") { + Ok(Self::load_huggingface_json(hf)?) 
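                    // Note: the `tokenizer.huggingface.json` key checked above takes priority;
                    // only when it is absent do the `else if` branches below fall back to the
                    // GGML-style embedded tokenizer (currently gated off) or report that no
                    // supported tokenizer was found.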
+ } else if EmbeddedTokenizer::is_present_in_metadata(&gguf.metadata) { + if EMBEDDED_TOKENIZER_ENABLED { + Ok(EmbeddedTokenizer::from_metadata(&gguf.metadata)?.into()) + } else { + Err(TokenizerLoadError::NoSupportedTokenizersFound { + unsupported_tokenizers: vec!["embedded".to_owned()], + }) + } + } else { + Err(TokenizerLoadError::NoSupportedTokenizersFound { + unsupported_tokenizers: vec![], + }) + } + } + } + } - Self::HuggingFaceTokenizerString(s) => HuggingFaceTokenizer::new( - tokenizers::Tokenizer::from_str(&s) - .map_err(|error| TokenizerLoadError::new(model_path, error))?, - ) + fn load_huggingface_json(tokenizer_json: &str) -> Result { + Ok( + HuggingFaceTokenizer::new(tokenizers::Tokenizer::from_str(tokenizer_json).map_err( + |error| TokenizerLoadError::HuggingFaceTokenizerError { + tokenizer_source: HuggingFaceTokenizerErrorSource::String, + error, + }, + )?) .into(), - - Self::Embedded => EmbeddedTokenizer::default().into(), - }) + ) } } - /// Encapsulates the tokenizer for a model, and provides methods to tokenize text. pub enum Tokenizer { /// The vocabulary built-in to the model. @@ -123,13 +186,6 @@ impl From for Tokenizer { Self::HuggingFace(v) } } -impl Tokenizer { - /// Creates an empty embedded tokenizer, for contexts where you need a tokenizer but don't - /// need to tokenize anything. - pub(crate) fn empty_embedded() -> Self { - Self::Embedded(EmbeddedTokenizer::default()) - } -} impl Tokenizer { /// Converts a token to the token ID it represents in this tokenizer. pub fn id(&self, token: &[u8]) -> Option { diff --git a/crates/llm-base/src/util.rs b/crates/llm-base/src/util.rs index 70fe2994..586c5b6f 100644 --- a/crates/llm-base/src/util.rs +++ b/crates/llm-base/src/util.rs @@ -1,11 +1,6 @@ //! Utilities for interacting with LLMs and loading them. pub use ggml::util::*; -use std::{ - io::BufRead, - path::{Path, PathBuf}, -}; - /// NOTE: The original code relies in promotion rules and automatic cast between /// int to float. What we do instead is use this macro to convert every term of /// the multiplication to f64, which should have enough precision bits to hold @@ -22,15 +17,6 @@ macro_rules! mulf { } use memmap2::{Mmap, MmapAsRawDesc, MmapOptions}; -use thiserror::Error; - -use crate::{FileType, LoadError}; - -/// Read the filetype from a reader. -pub fn read_filetype(reader: &mut dyn BufRead) -> Result { - let ftype = read_u32(reader)?; - FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype)) -} /// Used to buffer incoming tokens until they produce a valid string of UTF-8 text. /// @@ -73,67 +59,6 @@ impl TokenUtf8Buffer { } } -#[derive(Error, Debug)] -/// Errors encountered during the loading process. -pub enum FindAllModelFilesError { - #[error("no parent path for {path:?}")] - /// There is no parent path for a given path. - NoParentPath { - /// The path without a parent. - path: PathBuf, - }, - #[error("non-specific I/O error")] - /// A non-specific IO error. - IO(#[from] std::io::Error), -} - -/// Find all the files related to a model. -pub fn find_all_model_files(main_path: &Path) -> Result, FindAllModelFilesError> { - let mut main_path_parent = - main_path - .parent() - .ok_or_else(|| FindAllModelFilesError::NoParentPath { - path: main_path.to_owned(), - })?; - if main_path_parent.to_str() == Some("") { - main_path_parent = Path::new("."); - } - Ok(collect_related_paths( - main_path, - std::fs::read_dir(main_path_parent)? 
- .filter_map(Result::ok) - .map(|de| de.path()), - )) -} - -fn collect_related_paths( - main_path: &Path, - directory_paths: impl Iterator, -) -> Vec { - let main_filename = main_path.file_name().and_then(|p| p.to_str()); - - let mut paths: Vec = directory_paths - .filter(|p| { - p.file_name() - .and_then(|p| p.to_str()) - .zip(main_filename) - .map_or(false, |(part_filename, main_filename)| match part_filename - .strip_prefix(main_filename) - { - Some(suffix) => { - suffix.is_empty() - || (suffix - .strip_prefix('.') - .map_or(false, |s| s.parse::().is_ok())) - } - None => false, - }) - }) - .collect(); - paths.sort(); - paths -} - /// mmap with MAP_POPULATE pub fn mmap_populate(file: T) -> Result { unsafe { MmapOptions::new().populate().map(file) } @@ -154,27 +79,6 @@ pub fn softmax(logits: &[f32]) -> Vec { mod tests { use super::*; - #[test] - fn test_collect_related_paths() { - let main_path = PathBuf::from("/models/llama.bin"); - let directory_paths = [ - "/models/llama.bin", - "/models/llama.bin.1", - "/models/llama.bin.2", - "/models/llama.bin.tmp", - ] - .map(PathBuf::from); - let expected_paths = [ - "/models/llama.bin", - "/models/llama.bin.1", - "/models/llama.bin.2", - ] - .map(PathBuf::from); - - let output_paths = collect_related_paths(&main_path, directory_paths.into_iter()); - assert_eq!(expected_paths.as_slice(), output_paths); - } - #[test] fn test_valid_utf8() { let mut buffer = TokenUtf8Buffer::new(); diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml index efff39e5..159950f8 100644 --- a/crates/llm/Cargo.toml +++ b/crates/llm/Cargo.toml @@ -20,6 +20,7 @@ llm-bert = { path = "../models/bert", optional = true, version = "0.2.0-dev" } serde = { workspace = true } tracing = { workspace = true } +thiserror = { workspace = true } [dev-dependencies] bytesize = { workspace = true } @@ -35,7 +36,7 @@ default = ["models", "tokenizers-remote"] tokenizers-remote = ["llm-base/tokenizers-remote"] -models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "bert"] +models = ["llama", "gptneox", "gpt2", "gptj", "bloom", "mpt", "bert"] llama = ["dep:llm-llama"] gpt2 = ["dep:llm-gpt2"] gptj = ["dep:llm-gptj"] diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs index 427c9e48..64fc4009 100644 --- a/crates/llm/examples/embeddings.rs +++ b/crates/llm/examples/embeddings.rs @@ -4,7 +4,6 @@ use clap::Parser; #[derive(Parser)] struct Args { - model_architecture: llm::ModelArchitecture, model_path: PathBuf, #[arg(long, short = 'v')] pub tokenizer_path: Option, @@ -32,7 +31,6 @@ fn main() { let args = Args::parse(); let tokenizer_source = args.to_tokenizer_source(); - let model_architecture = args.model_architecture; let model_path = args.model_path; let query = args .query @@ -50,16 +48,13 @@ fn main() { // Load model let model_params = llm::ModelParameters::default(); - let model = llm::load_dynamic( - Some(model_architecture), + let model = llm::load( &model_path, tokenizer_source, model_params, llm::load_progress_callback_stdout, ) - .unwrap_or_else(|err| { - panic!("Failed to load {model_architecture} model from {model_path:?}: {err}") - }); + .unwrap_or_else(|err| panic!("Failed to load model from {model_path:?}: {err}")); let inference_parameters = llm::InferenceParameters::default(); // Generate embeddings for query and comparands @@ -109,7 +104,7 @@ fn main() { fn get_embeddings( model: &dyn llm::Model, - inference_parameters: &llm::InferenceParameters, + _inference_parameters: &llm::InferenceParameters, query: &str, ) -> Vec { let mut session 
= model.start_session(Default::default()); diff --git a/crates/llm/examples/inference.rs b/crates/llm/examples/inference.rs index 51e7369a..c3ffcb02 100644 --- a/crates/llm/examples/inference.rs +++ b/crates/llm/examples/inference.rs @@ -3,7 +3,6 @@ use std::{convert::Infallible, io::Write, path::PathBuf}; #[derive(Parser)] struct Args { - model_architecture: llm::ModelArchitecture, model_path: PathBuf, #[arg(long, short = 'p')] prompt: Option, @@ -29,7 +28,6 @@ fn main() { let args = Args::parse(); let tokenizer_source = args.to_tokenizer_source(); - let model_architecture = args.model_architecture; let model_path = args.model_path; let prompt = args .prompt @@ -38,16 +36,13 @@ fn main() { let now = std::time::Instant::now(); - let model = llm::load_dynamic( - Some(model_architecture), + let model = llm::load( &model_path, tokenizer_source, Default::default(), llm::load_progress_callback_stdout, ) - .unwrap_or_else(|err| { - panic!("Failed to load {model_architecture} model from {model_path:?}: {err}") - }); + .unwrap_or_else(|err| panic!("Failed to load model from {model_path:?}: {err}")); println!( "Model fully loaded! Elapsed: {}ms", diff --git a/crates/llm/examples/vicuna-chat.rs b/crates/llm/examples/vicuna-chat.rs index 4ced1ef2..1efb088e 100644 --- a/crates/llm/examples/vicuna-chat.rs +++ b/crates/llm/examples/vicuna-chat.rs @@ -5,7 +5,6 @@ use std::{convert::Infallible, io::Write, path::PathBuf}; #[derive(Parser)] struct Args { - model_architecture: llm::ModelArchitecture, model_path: PathBuf, #[arg(long, short = 'v')] pub tokenizer_path: Option, @@ -29,18 +28,14 @@ fn main() { let args = Args::parse(); let tokenizer_source = args.to_tokenizer_source(); - let model_architecture = args.model_architecture; let model_path = args.model_path; - let model = llm::load_dynamic( - Some(model_architecture), + let model = llm::load( &model_path, tokenizer_source, Default::default(), llm::load_progress_callback_stdout, ) - .unwrap_or_else(|err| { - panic!("Failed to load {model_architecture} model from {model_path:?}: {err}") - }); + .unwrap_or_else(|err| panic!("Failed to load model from {model_path:?}: {err}")); let mut session = model.start_session(Default::default()); diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 14800686..39069f06 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -7,6 +7,7 @@ //! - [GPT-NeoX](llm_gptneox) //! - [LLaMA](llm_llama) //! - [MPT](llm_mpt) +//! - [BERT](llm_bert) //! - Falcon (currently disabled due to incompleteness) //! //! At present, the only supported backend is [GGML](https://github.com/ggerganov/ggml), but this is expected to @@ -19,7 +20,7 @@ //! use llm::Model; //! //! // load a GGML model from disk -//! let llama = llm::load::( +//! let llama = llm::load( //! // path to GGML file //! std::path::Path::new("/path/to/model"), //! // llm::TokenizerSource @@ -35,7 +36,7 @@ //! let mut session = llama.start_session(Default::default()); //! let res = session.infer::( //! // model to use for text generation -//! &llama, +//! llama.as_ref(), //! // randomness provider //! &mut rand::thread_rng(), //! 
// the prompt to use for text generation, as well as other @@ -70,7 +71,6 @@ use std::{ error::Error, fmt::{Debug, Display}, - path::Path, str::FromStr, }; @@ -80,20 +80,22 @@ pub use llm_base::{ conversation_inference_callback, feed_prompt_callback, ggml::accelerator::get_accelerator as ggml_get_accelerator, ggml::accelerator::Accelerator as GgmlAccelerator, ggml::format as ggml_format, - ggml::RoPEOverrides, load, load_progress_callback_stdout, quantize, samplers, ElementType, - FileType, FileTypeFormat, FormatMagic, Hyperparameters, InferenceError, InferenceFeedback, - InferenceParameters, InferenceRequest, InferenceResponse, InferenceSession, - InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, - InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, + ggml::RoPEOverrides, quantize, samplers, tokenizer, ElementType, FileMagic, FileType, + FileTypeFormat, InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, + InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, + InferenceSnapshotRef, InferenceStats, InvalidTokenBias, Model, ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, RewindError, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Tokenizer, TokenizerSource, }; +mod loader; +pub use loader::{load, load_progress_callback_stdout, LoadError, LoadProgress}; + use serde::Serialize; macro_rules! define_models { - ($(($model_lowercase:ident, $model_lowercase_str:literal, $model_pascalcase:ident, $krate_ident:ident, $display_name:literal)),*) => { + ($(($model_lowercase:ident, $model_lowercase_str:literal, $model_pascalcase:ident, $krate_ident:ident, $display_name:literal),)*) => { /// All available models. pub mod models { $( @@ -124,7 +126,7 @@ macro_rules! define_models { impl ModelArchitecture { /// Use a visitor to dispatch some code based on the model architecture. - pub fn visit(&self, visitor: &mut impl ModelArchitectureVisitor) -> R { + pub fn visit(&self, visitor: impl ModelArchitectureVisitor) -> R { match self { $( #[cfg(feature = $model_lowercase_str)] @@ -172,24 +174,24 @@ macro_rules! define_models { } define_models!( - (bert, "bert", Bert, llm_bert, "Bert"), - (bloom, "bloom", Bloom, llm_bloom, "BLOOM"), - (gpt2, "gpt2", Gpt2, llm_gpt2, "GPT-2"), - (gptj, "gptj", GptJ, llm_gptj, "GPT-J"), + // (bert, "bert", Bert, llm_bert, "Bert"), + // (bloom, "bloom", Bloom, llm_bloom, "BLOOM"), + // (gpt2, "gpt2", Gpt2, llm_gpt2, "GPT-2"), + // (gptj, "gptj", GptJ, llm_gptj, "GPT-J"), (gptneox, "gptneox", GptNeoX, llm_gptneox, "GPT-NeoX"), (llama, "llama", Llama, llm_llama, "LLaMA"), - (mpt, "mpt", Mpt, llm_mpt, "MPT"), - (falcon, "falcon", Falcon, llm_falcon, "Falcon") + // (mpt, "mpt", Mpt, llm_mpt, "MPT"), + // (falcon, "falcon", Falcon, llm_falcon, "Falcon"), ); /// Used to dispatch some code based on the model architecture. pub trait ModelArchitectureVisitor { /// Visit a model architecture. - fn visit(&mut self) -> R; + fn visit(self) -> R; } /// An unsupported model architecture was specified. 
-pub struct UnsupportedModelArchitecture(String); +pub struct UnsupportedModelArchitecture(pub String); impl Display for UnsupportedModelArchitecture { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) @@ -204,66 +206,6 @@ impl Debug for UnsupportedModelArchitecture { } } -/// A helper function that loads the specified model from disk using an architecture -/// specified at runtime. If no architecture is specified, it will try to infer it -/// from the model's metadata. -/// -/// This method returns a [`Box`], which means that the model will have single ownership. -/// If you'd like to share ownership (i.e. to use the model in multiple threads), we -/// suggest using [`Arc::from(Box)`](https://doc.rust-lang.org/std/sync/struct.Arc.html#impl-From%3CBox%3CT,+Global%3E%3E-for-Arc%3CT%3E) -/// to convert the [`Box`] into an [`Arc`](std::sync::Arc) after loading. -pub fn load_dynamic( - architecture: Option, - path: &Path, - tokenizer_source: TokenizerSource, - params: ModelParameters, - load_progress_callback: impl FnMut(LoadProgress), -) -> Result, LoadError> { - fn load_model( - path: &Path, - tokenizer_source: TokenizerSource, - params: ModelParameters, - load_progress_callback: impl FnMut(LoadProgress), - ) -> Result, LoadError> { - Ok(Box::new(load::( - path, - tokenizer_source, - params, - load_progress_callback, - )?)) - } - - let architecture = architecture.ok_or_else(|| LoadError::MissingModelArchitecture { - path: path.to_owned(), - })?; - - struct LoadVisitor<'a, F: FnMut(LoadProgress)> { - path: &'a Path, - tokenizer_source: TokenizerSource, - params: ModelParameters, - load_progress_callback: F, - } - impl<'a, F: FnMut(LoadProgress)> ModelArchitectureVisitor, LoadError>> - for LoadVisitor<'a, F> - { - fn visit(&mut self) -> Result, LoadError> { - load_model::( - self.path, - self.tokenizer_source.clone(), - self.params.clone(), - &mut self.load_progress_callback, - ) - } - } - - architecture.visit(&mut LoadVisitor { - path, - tokenizer_source, - params, - load_progress_callback, - }) -} - #[cfg(test)] mod tests { use super::*; diff --git a/crates/llm/src/loader.rs b/crates/llm/src/loader.rs new file mode 100644 index 00000000..dd75fd4a --- /dev/null +++ b/crates/llm/src/loader.rs @@ -0,0 +1,55 @@ +use std::path::Path; + +pub use llm_base::loader::{load_progress_callback_stdout, LoadError, LoadProgress}; +use llm_base::{ + loader::{ModelFactory, ModelLoadCallback}, + model::{ModelLoadArgs, ModelLoadError}, + Model, ModelParameters, TokenizerSource, +}; + +use crate::{ModelArchitecture, ModelArchitectureVisitor}; + +/// Loads the specified GGUF model from disk, determining its architecture from the metadata, +/// and loading it with one of the supported modules. If you want to load a custom model, +/// consider using [llm_base::loader::load] directly. +/// +/// This method returns a [`Box`], which means that the model will have single ownership. +/// If you'd like to share ownership (i.e. to use the model in multiple threads), we +/// suggest using [`Arc::from(Box)`](https://doc.rust-lang.org/std/sync/struct.Arc.html#impl-From%3CBox%3CT,+Global%3E%3E-for-Arc%3CT%3E) +/// to convert the [`Box`] into an [`Arc`](std::sync::Arc) after loading. 
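The doc comment above introduces the new `llm::load` entry point (defined immediately below in crates/llm/src/loader.rs), which replaces the removed `load_dynamic` and infers the architecture from the model's metadata. A minimal usage sketch, mirroring the updated examples earlier in this diff; the model path is a placeholder and `TokenizerSource::Embedded` is an assumption about the available tokenizer-source variants, not something shown in this change:

// Sketch only: load a model dynamically and share it across threads.
use std::path::Path;

fn load_and_share() {
    let model_path = Path::new("/path/to/model.gguf"); // placeholder path
    let model = llm::load(
        model_path,
        llm::TokenizerSource::Embedded,          // assumed variant
        llm::ModelParameters::default(),
        llm::load_progress_callback_stdout,
    )
    .unwrap_or_else(|err| panic!("Failed to load model from {model_path:?}: {err}"));

    // `model` is a Box<dyn llm::Model>; convert it to an Arc to share ownership,
    // as suggested in the doc comment above.
    let model: std::sync::Arc<dyn llm::Model> = std::sync::Arc::from(model);
    let _session = model.start_session(Default::default());
}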
+pub fn load( + path: &Path, + tokenizer_source: TokenizerSource, + params: ModelParameters, + load_progress_callback: impl FnMut(LoadProgress), +) -> Result, LoadError> { + llm_base::loader::load( + path, + tokenizer_source, + params, + VisitorModelFactory, + load_progress_callback, + ) +} + +struct VisitorModelFactory; +impl ModelFactory for VisitorModelFactory { + fn load(&self, architecture: &str) -> Option { + let architecture = architecture.parse::().ok()?; + Some(architecture.visit(VisitorModelFactoryVisitor)) + } +} + +struct VisitorModelFactoryVisitor; +impl ModelArchitectureVisitor for VisitorModelFactoryVisitor { + fn visit(self) -> ModelLoadCallback { + Self::new_for_model:: + } +} +impl VisitorModelFactoryVisitor { + fn new_for_model( + args: ModelLoadArgs, + ) -> Result, ModelLoadError> { + Ok(M::new(args).map(Box::new)?) + } +} diff --git a/crates/models/bert/src/lib.rs b/crates/models/bert/src/lib.rs index 9a8daf6e..b9bf1c63 100644 --- a/crates/models/bert/src/lib.rs +++ b/crates/models/bert/src/lib.rs @@ -1,464 +1,464 @@ -//! An implementation of [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) for the `llm` ecosystem. -#![deny(missing_docs)] - -use std::error::Error; - -use llm_base::{ - ggml, - model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, - ModelContext, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, -}; - -/// The BERT model. -/// -/// # Safety -/// This implements [Send] and [Sync] as it is immutable after construction. -pub struct Bert { - params: ModelParameters, - hyperparameters: Hyperparameters, - tokenizer: Tokenizer, - - word_embeddings: ggml::Tensor, - token_type_embeddings: ggml::Tensor, - position_embeddings: ggml::Tensor, - ln_e_w: ggml::Tensor, - ln_e_b: ggml::Tensor, - - // weights for the model - layers: Vec, - - // must be kept alive for the model - context: ModelContext, -} - -unsafe impl Send for Bert {} -unsafe impl Sync for Bert {} - -/// BERT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] -pub struct Hyperparameters { - /// Size of the model's vocabulary - pub n_vocab: usize, - - /// Maximum number of tokens - pub n_max_tokens: usize, - - /// Size of the model's embedding layer - pub n_embd: usize, - - /// n_head - pub n_intermediate: usize, - - /// Number of attention heads - pub n_head: usize, - - /// Number of layers in the model - pub n_layer: usize, - - /// file_type - pub file_type: FileType, -} - -impl KnownModel for Bert { - type Hyperparameters = Hyperparameters; - - fn new( - hyperparameters: Self::Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl TensorLoader, - ) -> Result { - let mut tl = tensor_loader; - - let word_embeddings = tl.load("embeddings.word_embeddings.weight")?; - let token_type_embeddings = tl.load("embeddings.token_type_embeddings.weight")?; - let position_embeddings = tl.load("embeddings.position_embeddings.weight")?; - - let ln_e_w = tl.load("embeddings.LayerNorm.weight")?; - let ln_e_b = tl.load("embeddings.LayerNorm.bias")?; - - let mut layers = Vec::new(); - - for i in 0..hyperparameters.n_layer { - let backend = params.backend(i); - - let layer = Layer { - ln_att_w: tl - .load(&format!( - "encoder.layer.{i}.attention.output.LayerNorm.weight" - ))? 
- .transfer_to(backend), - ln_att_b: tl - .load(&format!( - "encoder.layer.{i}.attention.output.LayerNorm.bias" - ))? - .transfer_to(backend), - - // attention - q_w: tl - .load(&format!("encoder.layer.{i}.attention.self.query.weight"))? - .transfer_to(backend), - q_b: tl - .load(&format!("encoder.layer.{i}.attention.self.query.bias"))? - .transfer_to(backend), - k_w: tl - .load(&format!("encoder.layer.{i}.attention.self.key.weight"))? - .transfer_to(backend), - k_b: tl - .load(&format!("encoder.layer.{i}.attention.self.key.bias"))? - .transfer_to(backend), - v_w: tl - .load(&format!("encoder.layer.{i}.attention.self.value.weight"))? - .transfer_to(backend), - v_b: tl - .load(&format!("encoder.layer.{i}.attention.self.value.bias"))? - .transfer_to(backend), - - o_w: tl - .load(&format!("encoder.layer.{i}.attention.output.dense.weight"))? - .transfer_to(backend), - o_b: tl - .load(&format!("encoder.layer.{i}.attention.output.dense.bias"))? - .transfer_to(backend), - - // ff - ff_i_w: tl - .load(&format!("encoder.layer.{i}.intermediate.dense.weight"))? - .transfer_to(backend), - ff_i_b: tl - .load(&format!("encoder.layer.{i}.intermediate.dense.bias"))? - .transfer_to(backend), - - ln_out_w: tl - .load(&format!("encoder.layer.{i}.output.LayerNorm.weight"))? - .transfer_to(backend), - ln_out_b: tl - .load(&format!("encoder.layer.{i}.output.LayerNorm.bias"))? - .transfer_to(backend), - ff_o_w: tl - .load(&format!("encoder.layer.{i}.output.dense.weight"))? - .transfer_to(backend), - ff_o_b: tl - .load(&format!("encoder.layer.{i}.output.dense.bias"))? - .transfer_to(backend), - }; - - layers.push(layer); - } - let context = tl.finish(); - - Ok(Self { - ln_e_b, - ln_e_w, - position_embeddings, - token_type_embeddings, - word_embeddings, - hyperparameters, - params, - tokenizer, - layers, - context, - }) - } - - /// Starts a new `InferenceSession` for this model. 
- fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { - InferenceSession::new( - config, - &self.params, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, - ) - } - - #[tracing::instrument(level = "trace", skip_all)] - fn evaluate( - &self, - session: &mut InferenceSession, - input_tokens: &[TokenId], - output_request: &mut OutputRequest, - ) { - let input_len = input_tokens.len(); - let _ctx_size = self.params.context_size; - - let Hyperparameters { - n_vocab, - n_max_tokens: _, - n_embd, - n_intermediate: _, - n_head, - n_layer, - file_type: _, - } = self.hyperparameters; - - let d_head = n_embd / n_head; - - let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let mut ctx0 = builder.ctx0.borrow_mut(); - let gf = ctx0.create_compute_graph(); - - let embd = builder.embd; - - let mut input_layer = ctx0.op_get_rows(&self.word_embeddings, embd); - - // IL = word_embeddings + token_types + position_embeddingso - { - // token-types: a zero tensor - let mut token_types = ctx0.new_tensor_1d(llm_base::ElementType::I32, input_len); - token_types.zero_data(); - - // position embeddings: another tensor - let position_buf: Vec = (0..input_len as i32).collect(); - let mut positions = ctx0.new_tensor_1d(llm_base::ElementType::I32, input_len); - unsafe { positions.write_data(bytemuck::cast_slice(&position_buf)) }; - - // IL += token_types - input_layer = ctx0.op_add( - &input_layer, - &ctx0.op_get_rows(&self.token_type_embeddings, &token_types), - ); - - // IL += position_embeddings - input_layer = ctx0.op_add( - &input_layer, - &ctx0.op_get_rows(&self.position_embeddings, &positions), - ); - } - - // embd norm - { - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_e_w), &self.ln_e_b); - } - - for il in 0..n_layer { - ctx0.set_offloading(self.params.should_offload(il)); - - let mut current = input_layer.share(); - - // self-attention - { - let q_current = ctx0.op_reshape_3d( - &ctx0.op_add( - &ctx0.op_mul_mat(&self.layers[il].q_w, ¤t), - &self.layers[il].q_b, - ), - d_head, - n_head, - input_len, - ); - let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)); - - let k_current = ctx0.op_reshape_3d( - &ctx0.op_add( - &ctx0.op_mul_mat(&self.layers[il].k_w, ¤t), - &self.layers[il].k_b, - ), - d_head, - n_head, - input_len, - ); - let k = ctx0.op_permute(&k_current, (0, 2, 1, 3)); - - let v_current = ctx0.op_reshape_3d( - &ctx0.op_add( - &ctx0.op_mul_mat(&self.layers[il].v_w, ¤t), - &self.layers[il].v_b, - ), - d_head, - n_head, - input_len, - ); - let mut v = ctx0.op_permute(&v_current, (0, 2, 1, 3)); - - let mut kq = ctx0.op_mul_mat(&k, &q); - - // TODO: look into op_scale_inplace and op_soft_max_inplace - kq = ctx0.op_scale( - &kq, - &ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())), - ); - kq = ctx0.op_soft_max(&kq); - - v = ctx0.op_cont(&ctx0.op_transpose(&v)); - - let mut kqv = ctx0.op_mul_mat(&v, &kq); - kqv = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - - current = ctx0.op_cpy( - &kqv, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); - } - - // attention output - current = ctx0.op_add( - &ctx0.op_mul_mat(&self.layers[il].o_w, ¤t), - &self.layers[il].o_b, - ); - - // re-add the layer input - current = ctx0.op_add(¤t, &input_layer); - - // attention norm - { - current = ctx0.op_norm(¤t); - current = ctx0.op_add( - &ctx0.op_mul(¤t, &self.layers[il].ln_att_w), - &self.layers[il].ln_att_b, - ); - } - - let att_output = current.share(); - 
- // intermediate output - current = ctx0.op_mul_mat(&self.layers[il].ff_i_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].ff_i_b); - current = ctx0.op_gelu(¤t); - - // layer output - current = ctx0.op_mul_mat(&self.layers[il].ff_o_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].ff_o_b); - - // attentions bypass the intermediate layer - current = ctx0.op_add(&att_output, ¤t); - - // output norm - { - current = ctx0.op_norm(¤t); - current = ctx0.op_add( - &ctx0.op_mul(¤t, &self.layers[il].ln_out_w), - &self.layers[il].ln_out_b, - ); - } - - // input for next layer - input_layer = current; - } - input_layer = ctx0.op_cont(&ctx0.op_transpose(&input_layer)); - - ctx0.set_offloading(false); - // pooler - let mut sum = ctx0.new_tensor_2d(llm_base::ElementType::F32, input_len, 1); - sum = ctx0.set_f32(&sum, 1.0 / (input_len as f32)); - input_layer = ctx0.op_mul_mat(&input_layer, &sum); - - // normalizer - let length = ctx0.op_sqrt(&ctx0.op_sum(&ctx0.op_sqr(&input_layer))); - - input_layer = ctx0.op_scale(&input_layer, &ctx0.op_div(&ctx0.new_f32(1.0), &length)); - - ( - gf, - GraphOutputs { - result: input_layer.share(), - embedding_result: input_layer.share(), - output_length: input_len, - }, - ) - }); - - // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, input_len); - common::extract_logits(output_request, &outputs.result, n_vocab, input_len); - common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, 1); - } - - fn hyperparameters(&self) -> &Self::Hyperparameters { - &self.hyperparameters - } - - fn tokenizer(&self) -> &Tokenizer { - &self.tokenizer - } - - fn context_size(&self) -> usize { - self.params.context_size - } - - fn bot_token_id(&self) -> Option { - self.tokenizer.id("[PAD]".as_bytes()) - } - - fn eot_token_id(&self) -> TokenId { - self.tokenizer.id("".as_bytes()).unwrap_or(2) - } - - fn quantize_tensors() -> Vec { - vec![Regex::new(".*weight").unwrap()] - } - - fn skip_quantize_tensors() -> Vec { - vec![] - } - - fn supports_rewind(&self) -> bool { - true - } -} - -impl llm_base::Hyperparameters for Hyperparameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - Ok(Hyperparameters { - n_vocab: util::read_i32(reader)?.try_into()?, - n_max_tokens: util::read_i32(reader)?.try_into()?, - n_embd: util::read_i32(reader)?.try_into()?, - n_intermediate: util::read_i32(reader)?.try_into()?, - n_head: util::read_i32(reader)?.try_into()?, - n_layer: util::read_i32(reader)?.try_into()?, - file_type: util::read_filetype(reader)?, - }) - } - - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.n_vocab.try_into()?)?; - util::write_i32(writer, self.n_max_tokens.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_intermediate.try_into()?)?; - util::write_i32(writer, self.n_head.try_into()?)?; - util::write_i32(writer, self.n_layer.try_into()?)?; - util::write_i32(writer, self.file_type.into())?; - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } - - fn file_type(&self) -> Option { - Some(self.file_type) - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - Some(&mut self.file_type) - } -} - -struct Layer { - // normalization - ln_att_w: ggml::Tensor, - ln_att_b: ggml::Tensor, - - ln_out_w: ggml::Tensor, - ln_out_b: ggml::Tensor, - - // attention - q_w: ggml::Tensor, - q_b: ggml::Tensor, - k_w: ggml::Tensor, - k_b: ggml::Tensor, - v_w: ggml::Tensor, - v_b: 
ggml::Tensor, - - o_w: ggml::Tensor, - o_b: ggml::Tensor, - - // ff - ff_i_w: ggml::Tensor, - ff_i_b: ggml::Tensor, - - ff_o_w: ggml::Tensor, - ff_o_b: ggml::Tensor, -} +// //! An implementation of [BERT](https://huggingface.co/docs/transformers/model_doc/bert) for the `llm` ecosystem. +// #![deny(missing_docs)] + +// use std::error::Error; + +// use llm_base::{ +// ggml, +// model::{common, HyperparametersWriteError}, +// util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, LoadError, Model, +// ModelContext, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, +// }; + +// /// The BERT model. +// /// +// /// # Safety +// /// This implements [Send] and [Sync] as it is immutable after construction. +// pub struct Bert { +// params: ModelParameters, +// hyperparameters: Hyperparameters, +// tokenizer: Tokenizer, + +// word_embeddings: ggml::Tensor, +// token_type_embeddings: ggml::Tensor, +// position_embeddings: ggml::Tensor, +// ln_e_w: ggml::Tensor, +// ln_e_b: ggml::Tensor, + +// // weights for the model +// layers: Vec, + +// // must be kept alive for the model +// context: ModelContext, +// } + +// unsafe impl Send for Bert {} +// unsafe impl Sync for Bert {} + +// /// BERT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +// #[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] +// pub struct Hyperparameters { +// /// Size of the model's vocabulary +// pub n_vocab: usize, + +// /// Maximum number of tokens +// pub n_max_tokens: usize, + +// /// Size of the model's embedding layer +// pub n_embd: usize, + +// /// n_head +// pub n_intermediate: usize, + +// /// Number of attention heads +// pub n_head: usize, + +// /// Number of layers in the model +// pub n_layer: usize, + +// /// file_type +// pub file_type: FileType, +// } + +// impl Model for Bert { +// type Hyperparameters = Hyperparameters; + +// fn new( +// hyperparameters: Self::Hyperparameters, +// params: ModelParameters, +// tokenizer: Tokenizer, +// tensor_loader: impl TensorLoader, +// ) -> Result { +// let mut tl = tensor_loader; + +// let word_embeddings = tl.load("embeddings.word_embeddings.weight")?; +// let token_type_embeddings = tl.load("embeddings.token_type_embeddings.weight")?; +// let position_embeddings = tl.load("embeddings.position_embeddings.weight")?; + +// let ln_e_w = tl.load("embeddings.LayerNorm.weight")?; +// let ln_e_b = tl.load("embeddings.LayerNorm.bias")?; + +// let mut layers = Vec::new(); + +// for i in 0..hyperparameters.n_layer { +// let backend = params.backend(i); + +// let layer = Layer { +// ln_att_w: tl +// .load(&format!( +// "encoder.layer.{i}.attention.output.LayerNorm.weight" +// ))? +// .transfer_to(backend), +// ln_att_b: tl +// .load(&format!( +// "encoder.layer.{i}.attention.output.LayerNorm.bias" +// ))? +// .transfer_to(backend), + +// // attention +// q_w: tl +// .load(&format!("encoder.layer.{i}.attention.self.query.weight"))? +// .transfer_to(backend), +// q_b: tl +// .load(&format!("encoder.layer.{i}.attention.self.query.bias"))? +// .transfer_to(backend), +// k_w: tl +// .load(&format!("encoder.layer.{i}.attention.self.key.weight"))? +// .transfer_to(backend), +// k_b: tl +// .load(&format!("encoder.layer.{i}.attention.self.key.bias"))? +// .transfer_to(backend), +// v_w: tl +// .load(&format!("encoder.layer.{i}.attention.self.value.weight"))? +// .transfer_to(backend), +// v_b: tl +// .load(&format!("encoder.layer.{i}.attention.self.value.bias"))? 
+// .transfer_to(backend), + +// o_w: tl +// .load(&format!("encoder.layer.{i}.attention.output.dense.weight"))? +// .transfer_to(backend), +// o_b: tl +// .load(&format!("encoder.layer.{i}.attention.output.dense.bias"))? +// .transfer_to(backend), + +// // ff +// ff_i_w: tl +// .load(&format!("encoder.layer.{i}.intermediate.dense.weight"))? +// .transfer_to(backend), +// ff_i_b: tl +// .load(&format!("encoder.layer.{i}.intermediate.dense.bias"))? +// .transfer_to(backend), + +// ln_out_w: tl +// .load(&format!("encoder.layer.{i}.output.LayerNorm.weight"))? +// .transfer_to(backend), +// ln_out_b: tl +// .load(&format!("encoder.layer.{i}.output.LayerNorm.bias"))? +// .transfer_to(backend), +// ff_o_w: tl +// .load(&format!("encoder.layer.{i}.output.dense.weight"))? +// .transfer_to(backend), +// ff_o_b: tl +// .load(&format!("encoder.layer.{i}.output.dense.bias"))? +// .transfer_to(backend), +// }; + +// layers.push(layer); +// } +// let context = tl.finish(); + +// Ok(Self { +// ln_e_b, +// ln_e_w, +// position_embeddings, +// token_type_embeddings, +// word_embeddings, +// hyperparameters, +// params, +// tokenizer, +// layers, +// context, +// }) +// } + +// /// Starts a new `InferenceSession` for this model. +// fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { +// InferenceSession::new( +// config, +// &self.params, +// self.hyperparameters.n_layer, +// self.hyperparameters.n_embd, +// self.hyperparameters.n_vocab, +// ) +// } + +// #[tracing::instrument(level = "trace", skip_all)] +// fn evaluate( +// &self, +// session: &mut InferenceSession, +// input_tokens: &[TokenId], +// output_request: &mut OutputRequest, +// ) { +// let input_len = input_tokens.len(); +// let _ctx_size = self.params.context_size; + +// let Hyperparameters { +// n_vocab, +// n_max_tokens: _, +// n_embd, +// n_intermediate: _, +// n_head, +// n_layer, +// file_type: _, +// } = self.hyperparameters; + +// let d_head = n_embd / n_head; + +// let outputs = session.compute(self.context.clone(), input_tokens, |builder| { +// let mut ctx0 = builder.ctx0.borrow_mut(); +// let gf = ctx0.create_compute_graph(); + +// let embd = builder.embd; + +// let mut input_layer = ctx0.op_get_rows(&self.word_embeddings, embd); + +// // IL = word_embeddings + token_types + position_embeddingso +// { +// // token-types: a zero tensor +// let mut token_types = ctx0.new_tensor_1d(llm_base::ElementType::I32, input_len); +// token_types.zero_data(); + +// // position embeddings: another tensor +// let position_buf: Vec = (0..input_len as i32).collect(); +// let mut positions = ctx0.new_tensor_1d(llm_base::ElementType::I32, input_len); +// unsafe { positions.write_data(bytemuck::cast_slice(&position_buf)) }; + +// // IL += token_types +// input_layer = ctx0.op_add( +// &input_layer, +// &ctx0.op_get_rows(&self.token_type_embeddings, &token_types), +// ); + +// // IL += position_embeddings +// input_layer = ctx0.op_add( +// &input_layer, +// &ctx0.op_get_rows(&self.position_embeddings, &positions), +// ); +// } + +// // embd norm +// { +// input_layer = ctx0.op_norm(&input_layer); +// input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_e_w), &self.ln_e_b); +// } + +// for il in 0..n_layer { +// ctx0.set_offloading(self.params.should_offload(il)); + +// let mut current = input_layer.share(); + +// // self-attention +// { +// let q_current = ctx0.op_reshape_3d( +// &ctx0.op_add( +// &ctx0.op_mul_mat(&self.layers[il].q_w, ¤t), +// &self.layers[il].q_b, +// ), +// d_head, +// n_head, +// input_len, +// ); 
+// let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)); + +// let k_current = ctx0.op_reshape_3d( +// &ctx0.op_add( +// &ctx0.op_mul_mat(&self.layers[il].k_w, ¤t), +// &self.layers[il].k_b, +// ), +// d_head, +// n_head, +// input_len, +// ); +// let k = ctx0.op_permute(&k_current, (0, 2, 1, 3)); + +// let v_current = ctx0.op_reshape_3d( +// &ctx0.op_add( +// &ctx0.op_mul_mat(&self.layers[il].v_w, ¤t), +// &self.layers[il].v_b, +// ), +// d_head, +// n_head, +// input_len, +// ); +// let mut v = ctx0.op_permute(&v_current, (0, 2, 1, 3)); + +// let mut kq = ctx0.op_mul_mat(&k, &q); + +// // TODO: look into op_scale_inplace and op_soft_max_inplace +// kq = ctx0.op_scale( +// &kq, +// &ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())), +// ); +// kq = ctx0.op_soft_max(&kq); + +// v = ctx0.op_cont(&ctx0.op_transpose(&v)); + +// let mut kqv = ctx0.op_mul_mat(&v, &kq); +// kqv = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + +// current = ctx0.op_cpy( +// &kqv, +// &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), +// ); +// } + +// // attention output +// current = ctx0.op_add( +// &ctx0.op_mul_mat(&self.layers[il].o_w, ¤t), +// &self.layers[il].o_b, +// ); + +// // re-add the layer input +// current = ctx0.op_add(¤t, &input_layer); + +// // attention norm +// { +// current = ctx0.op_norm(¤t); +// current = ctx0.op_add( +// &ctx0.op_mul(¤t, &self.layers[il].ln_att_w), +// &self.layers[il].ln_att_b, +// ); +// } + +// let att_output = current.share(); + +// // intermediate output +// current = ctx0.op_mul_mat(&self.layers[il].ff_i_w, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].ff_i_b); +// current = ctx0.op_gelu(¤t); + +// // layer output +// current = ctx0.op_mul_mat(&self.layers[il].ff_o_w, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].ff_o_b); + +// // attentions bypass the intermediate layer +// current = ctx0.op_add(&att_output, ¤t); + +// // output norm +// { +// current = ctx0.op_norm(¤t); +// current = ctx0.op_add( +// &ctx0.op_mul(¤t, &self.layers[il].ln_out_w), +// &self.layers[il].ln_out_b, +// ); +// } + +// // input for next layer +// input_layer = current; +// } +// input_layer = ctx0.op_cont(&ctx0.op_transpose(&input_layer)); + +// ctx0.set_offloading(false); +// // pooler +// let mut sum = ctx0.new_tensor_2d(llm_base::ElementType::F32, input_len, 1); +// sum = ctx0.set_f32(&sum, 1.0 / (input_len as f32)); +// input_layer = ctx0.op_mul_mat(&input_layer, &sum); + +// // normalizer +// let length = ctx0.op_sqrt(&ctx0.op_sum(&ctx0.op_sqr(&input_layer))); + +// input_layer = ctx0.op_scale(&input_layer, &ctx0.op_div(&ctx0.new_f32(1.0), &length)); + +// ( +// gf, +// GraphOutputs { +// result: input_layer.share(), +// embedding_result: input_layer.share(), +// output_length: input_len, +// }, +// ) +// }); + +// // finish evaluation +// common::read_last_token(session, &outputs.result, n_vocab, input_len); +// common::extract_logits(output_request, &outputs.result, n_vocab, input_len); +// common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, 1); +// } + +// fn hyperparameters(&self) -> &Self::Hyperparameters { +// &self.hyperparameters +// } + +// fn tokenizer(&self) -> &Tokenizer { +// &self.tokenizer +// } + +// fn context_size(&self) -> usize { +// self.params.context_size +// } + +// fn bot_token_id(&self) -> Option { +// self.tokenizer.id("[PAD]".as_bytes()) +// } + +// fn eot_token_id(&self) -> TokenId { +// self.tokenizer.id("".as_bytes()).unwrap_or(2) +// } + +// fn quantize_tensors() -> Vec { +// 
vec![Regex::new(".*weight").unwrap()] +// } + +// fn skip_quantize_tensors() -> Vec { +// vec![] +// } + +// fn supports_rewind(&self) -> bool { +// true +// } +// } + +// impl llm_base::Hyperparameters for Hyperparameters { +// fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { +// Ok(Hyperparameters { +// n_vocab: util::read_i32(reader)?.try_into()?, +// n_max_tokens: util::read_i32(reader)?.try_into()?, +// n_embd: util::read_i32(reader)?.try_into()?, +// n_intermediate: util::read_i32(reader)?.try_into()?, +// n_head: util::read_i32(reader)?.try_into()?, +// n_layer: util::read_i32(reader)?.try_into()?, +// file_type: util::read_filetype(reader)?, +// }) +// } + +// fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { +// util::write_i32(writer, self.n_vocab.try_into()?)?; +// util::write_i32(writer, self.n_max_tokens.try_into()?)?; +// util::write_i32(writer, self.n_embd.try_into()?)?; +// util::write_i32(writer, self.n_intermediate.try_into()?)?; +// util::write_i32(writer, self.n_head.try_into()?)?; +// util::write_i32(writer, self.n_layer.try_into()?)?; +// util::write_i32(writer, self.file_type.into())?; +// Ok(()) +// } + +// fn n_vocabulary(&self) -> usize { +// self.n_vocab +// } + +// fn file_type(&self) -> Option { +// Some(self.file_type) +// } + +// fn file_type_mut(&mut self) -> Option<&mut FileType> { +// Some(&mut self.file_type) +// } +// } + +// struct Layer { +// // normalization +// ln_att_w: ggml::Tensor, +// ln_att_b: ggml::Tensor, + +// ln_out_w: ggml::Tensor, +// ln_out_b: ggml::Tensor, + +// // attention +// q_w: ggml::Tensor, +// q_b: ggml::Tensor, +// k_w: ggml::Tensor, +// k_b: ggml::Tensor, +// v_w: ggml::Tensor, +// v_b: ggml::Tensor, + +// o_w: ggml::Tensor, +// o_b: ggml::Tensor, + +// // ff +// ff_i_w: ggml::Tensor, +// ff_i_b: ggml::Tensor, + +// ff_o_w: ggml::Tensor, +// ff_o_b: ggml::Tensor, +// } diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index fb26ff3d..41ce7262 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -1,455 +1,455 @@ -//! An implementation of [BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom) -//! for the `llm` ecosystem. -#![deny(missing_docs)] - -use llm_base::{ - ggml, - model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, - ModelContext, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, -}; - -/// The BLOOM model. Ref: [Introducing BLOOM](https://bigscience.huggingface.co/blog/bloom) -/// -/// # Safety -/// This implements [Send] and [Sync] as it is immutable after construction. 
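The (now commented-out) BERT graph above finishes by mean-pooling the per-token embeddings and dividing the pooled vector by its L2 norm. For reference, a plain-Rust sketch of that post-processing step outside ggml; the function name and the assumption that per-token embeddings are already available are illustrative:

/// Mean-pool a [n_tokens][n_embd] matrix of token embeddings and L2-normalize
/// the result, matching the pooler/normalizer stage of the BERT graph above.
/// Assumes at least one token.
fn pool_and_normalize(token_embeddings: &[Vec<f32>]) -> Vec<f32> {
    let n_tokens = token_embeddings.len() as f32;
    let n_embd = token_embeddings[0].len();

    // Mean pooling: average each embedding dimension over all tokens.
    let mut pooled = vec![0.0f32; n_embd];
    for token in token_embeddings {
        for (acc, value) in pooled.iter_mut().zip(token) {
            *acc += value / n_tokens;
        }
    }

    // L2 normalization: divide by sqrt(sum of squares).
    let length = pooled.iter().map(|x| x * x).sum::<f32>().sqrt();
    for value in &mut pooled {
        *value /= length;
    }
    pooled
}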
-pub struct Bloom { - params: ModelParameters, - - hyperparameters: Hyperparameters, - tokenizer: Tokenizer, - - // model-global weights - // weighted token embeddings - wte: ggml::Tensor, - // normalization weight & bias - norm: ggml::Tensor, - norm_bias: ggml::Tensor, - // output normalization weight & bias - output_norm: ggml::Tensor, - output_norm_bias: ggml::Tensor, - // output weight - output: ggml::Tensor, - - // weights for the model - layers: Vec, - - // must be kept alive for the model - context: ModelContext, -} - -unsafe impl Send for Bloom {} -unsafe impl Sync for Bloom {} - -impl KnownModel for Bloom { - type Hyperparameters = Hyperparameters; - - fn new( - hyperparameters: Self::Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl llm_base::TensorLoader, - ) -> Result { - let mut tl = tensor_loader; - - // model-global weights - let wte = tl.load("tok_embeddings.weight")?; - let norm = tl.load("norm.weight")?; - let norm_bias = tl.load("norm.bias")?; - let output_norm = tl.load("output_norm.weight")?; - let output_norm_bias = tl.load("output_norm.bias")?; - let output = tl.load("output.weight")?; - - let mut layers = Vec::new(); - for i in 0..hyperparameters.n_layer { - let layer = Layer { - attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"))?, - attention_norm_b: tl.load(&format!("layers.{i}.attention_norm.bias"))?, - - query_key_value: tl - .load(&format!("layers.{i}.attention.query_key_value.weight"))?, - query_key_value_b: tl - .load(&format!("layers.{i}.attention.query_key_value.bias"))?, - - wo: tl.load(&format!("layers.{i}.attention.wo.weight"))?, - wo_b: tl.load(&format!("layers.{i}.attention.wo.bias"))?, - - ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"))?, - ffn_norm_b: tl.load(&format!("layers.{i}.ffn_norm.bias"))?, - - w1: tl.load(&format!("layers.{i}.feed_forward.w1.weight"))?, - w1_b: tl.load(&format!("layers.{i}.feed_forward.w1.bias"))?, - w2: tl.load(&format!("layers.{i}.feed_forward.w2.weight"))?, - w2_b: tl.load(&format!("layers.{i}.feed_forward.w2.bias"))?, - }; - - layers.push(layer); - } - - let context = tl.finish(); - - Ok(Bloom { - hyperparameters, - params, - tokenizer, - wte, - norm, - norm_bias, - output_norm, - output_norm_bias, - output, - layers, - context, - }) - } - - fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { - InferenceSession::new( - config, - &self.params, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, - ) - } - - fn evaluate( - &self, - session: &mut InferenceSession, - input_tokens: &[TokenId], - output_request: &mut OutputRequest, - ) { - let ctx_size = self.params.context_size; - - let Hyperparameters { - n_vocab, - n_embd, - n_mult: _, - n_head, - n_layer, - file_type: _, - } = self.hyperparameters; - - let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let session_len = builder.n_past; - let input_len = builder.input_length(); - let ctx0 = builder.ctx0.borrow(); - let (memory_k_size, memory_v_size) = ( - builder.memory_k.element_size(), - builder.memory_v.element_size(), - ); - let embd = &builder.embd; - let mut input_layer = ctx0.op_get_rows(&self.wte, embd); - - // normalize embeddings - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_mul(&input_layer, &self.norm); - input_layer = ctx0.op_add(&input_layer, &self.norm_bias); - - let mut gf = ctx0.create_compute_graph(); - for il in 0..n_layer { - let input_self_attention = 
input_layer.share(); - let mut current: ggml::Tensor; - - // norm - current = ctx0.op_norm(&input_layer); - - // cur = attention_norm * cur - current = ctx0.op_mul(¤t, &self.layers[il].attention_norm); - current = ctx0.op_add(¤t, &self.layers[il].attention_norm_b); - - //attention - current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].query_key_value_b); - - // self-attention - let nb = current.get_nb()[1]; - let q_current = ctx0.op_view_2d( - ¤t, - (n_embd, input_len), - nb, - //0 * std::mem::size_of::() * n_embd as usize, - 0, - ); - let k_current = ctx0.op_view_2d( - ¤t, - (n_embd, input_len), - nb, - std::mem::size_of::() * n_embd, - ); - let v_current = ctx0.op_view_2d( - ¤t, - (n_embd, input_len), - nb, - 2 * std::mem::size_of::() * n_embd, - ); - - // store key and value to memory - if input_len >= 1 { - let k = ctx0.op_view_1d( - builder.memory_k, - input_len * n_embd, - (memory_k_size * n_embd) * (il * ctx_size + session_len), - ); - - let v = ctx0.op_view_1d( - builder.memory_v, - input_len * n_embd, - (memory_v_size * n_embd) * (il * ctx_size + session_len), - ); - - gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - let big_q = ctx0.op_permute( - &ctx0.op_cpy( - &q_current, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), - ), - (0, 2, 1, 3), - ); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - let big_k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - builder.memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * memory_k_size * n_embd, - ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); - - // K * Q - let k_q = ctx0.op_mul_mat(&big_k, &big_q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - let k_q_scaled = ctx0.op_scale( - &k_q, - &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - //alibi - // KQ_scaled_alibi = KQ_scaled + alibi_bias - let k_q_scaled_alibi = ctx0.op_alibi(&k_q_scaled, session_len, n_head, 8f32); - - // KQ_masked = mask_past(KQ_scaled) - let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled_alibi, session_len); - - // KQ = soft_max(KQ_masked) - let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); - - let memv_elsize = memory_v_size; - - let v_trans = ctx0.op_cpy( - &ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - builder.memory_v, - (session_len + input_len) * n_embd, - il * ctx_size * memv_elsize * n_embd, - ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (1, 2, 0, 3), - ), - &ctx0.new_tensor_3d( - builder.memory_v.get_type(), - session_len + input_len, - n_embd / n_head, - n_head, - ), - ); - - let k_q_v = ctx0.op_mul_mat(&v_trans, &k_q_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); - - // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy( - &k_q_v_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); - - // projection - current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].wo_b); - - let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); - - // feed-forward network - // norm - current = ctx0.op_norm(&input_feed_forward); - - // cur = ffn_norm*cur + ffn_norm_b - current = ctx0.op_mul(¤t, &self.layers[il].ffn_norm); - - current = 
ctx0.op_add(¤t, &self.layers[il].ffn_norm_b); - - current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); - - current = ctx0.op_add(¤t, &self.layers[il].w1_b); - - // SILU activation - - current = ctx0.op_gelu(¤t); - - current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); - - current = ctx0.op_add(¤t, &self.layers[il].w2_b); - - current = ctx0.op_add(¤t, &input_feed_forward); - - // input for next layer - input_layer = current; - } - - // norm - input_layer = ctx0.op_norm(&input_layer); - - // inpL = norm*inpL - input_layer = ctx0.op_mul(&input_layer, &self.output_norm); - - input_layer = ctx0.op_add(&input_layer, &self.output_norm_bias); - - let embeddings_tensor: ggml::Tensor = input_layer.share(); - - // lm_head - input_layer = ctx0.op_mul_mat(&self.output, &input_layer); - - ( - gf, - GraphOutputs { - result: input_layer, - embedding_result: embeddings_tensor, - output_length: input_len, - }, - ) - }); - - // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); - common::extract_logits( - output_request, - &outputs.result, - n_vocab, - outputs.output_length, - ); - common::extract_embeddings( - output_request, - &outputs.embedding_result, - n_embd, - outputs.output_length, - ); - } - - fn hyperparameters(&self) -> &Self::Hyperparameters { - &self.hyperparameters - } - - fn tokenizer(&self) -> &Tokenizer { - &self.tokenizer - } - - fn context_size(&self) -> usize { - self.params.context_size - } - - fn bot_token_id(&self) -> Option { - self.tokenizer.id("".as_bytes()) - } - - fn eot_token_id(&self) -> TokenId { - self.tokenizer.id("".as_bytes()).unwrap() - } - - fn quantize_tensors() -> Vec { - vec![Regex::new(".*weight").unwrap()] - } - - fn skip_quantize_tensors() -> Vec { - vec![] - } - - fn supports_rewind(&self) -> bool { - true - } -} - -/// BLOOM [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] -pub struct Hyperparameters { - /// Size of the model's vocabulary - pub n_vocab: usize, - /// Size of the model's embedding layer - pub n_embd: usize, - /// n_mult - pub n_mult: usize, - /// n_head - pub n_head: usize, - /// Number of layers in the model - pub n_layer: usize, - /// file_type - pub file_type: FileType, -} - -impl llm_base::Hyperparameters for Hyperparameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - Ok(Hyperparameters { - n_vocab: util::read_i32(reader)?.try_into()?, - n_embd: util::read_i32(reader)?.try_into()?, - n_mult: util::read_i32(reader)?.try_into()?, - n_head: util::read_i32(reader)?.try_into()?, - n_layer: util::read_i32(reader)?.try_into()?, - file_type: util::read_filetype(reader)?, - }) - } - - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.n_vocab.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_mult.try_into()?)?; - util::write_i32(writer, self.n_head.try_into()?)?; - util::write_i32(writer, self.n_layer.try_into()?)?; - util::write_i32(writer, self.file_type.into())?; - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } - - fn file_type(&self) -> Option { - Some(self.file_type) - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - Some(&mut self.file_type) - } -} - -struct Layer { - pub attention_norm: ggml::Tensor, - pub attention_norm_b: ggml::Tensor, - pub wo: ggml::Tensor, - pub wo_b: ggml::Tensor, - pub query_key_value: ggml::Tensor, - pub 
query_key_value_b: ggml::Tensor, - // normalization - pub ffn_norm: ggml::Tensor, - pub ffn_norm_b: ggml::Tensor, - // ff - pub w1: ggml::Tensor, - pub w1_b: ggml::Tensor, - pub w2: ggml::Tensor, - pub w2_b: ggml::Tensor, -} +// //! An implementation of [BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom) +// //! for the `llm` ecosystem. +// #![deny(missing_docs)] + +// use llm_base::{ +// ggml, +// model::{common, HyperparametersWriteError}, +// util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, Model, +// ModelContext, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, +// }; + +// /// The BLOOM model. Ref: [Introducing BLOOM](https://bigscience.huggingface.co/blog/bloom) +// /// +// /// # Safety +// /// This implements [Send] and [Sync] as it is immutable after construction. +// pub struct Bloom { +// params: ModelParameters, + +// hyperparameters: Hyperparameters, +// tokenizer: Tokenizer, + +// // model-global weights +// // weighted token embeddings +// wte: ggml::Tensor, +// // normalization weight & bias +// norm: ggml::Tensor, +// norm_bias: ggml::Tensor, +// // output normalization weight & bias +// output_norm: ggml::Tensor, +// output_norm_bias: ggml::Tensor, +// // output weight +// output: ggml::Tensor, + +// // weights for the model +// layers: Vec, + +// // must be kept alive for the model +// context: ModelContext, +// } + +// unsafe impl Send for Bloom {} +// unsafe impl Sync for Bloom {} + +// impl Model for Bloom { +// type Hyperparameters = Hyperparameters; + +// fn new( +// hyperparameters: Self::Hyperparameters, +// params: ModelParameters, +// tokenizer: Tokenizer, +// tensor_loader: impl llm_base::TensorLoader, +// ) -> Result { +// let mut tl = tensor_loader; + +// // model-global weights +// let wte = tl.load("tok_embeddings.weight")?; +// let norm = tl.load("norm.weight")?; +// let norm_bias = tl.load("norm.bias")?; +// let output_norm = tl.load("output_norm.weight")?; +// let output_norm_bias = tl.load("output_norm.bias")?; +// let output = tl.load("output.weight")?; + +// let mut layers = Vec::new(); +// for i in 0..hyperparameters.n_layer { +// let layer = Layer { +// attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"))?, +// attention_norm_b: tl.load(&format!("layers.{i}.attention_norm.bias"))?, + +// query_key_value: tl +// .load(&format!("layers.{i}.attention.query_key_value.weight"))?, +// query_key_value_b: tl +// .load(&format!("layers.{i}.attention.query_key_value.bias"))?, + +// wo: tl.load(&format!("layers.{i}.attention.wo.weight"))?, +// wo_b: tl.load(&format!("layers.{i}.attention.wo.bias"))?, + +// ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"))?, +// ffn_norm_b: tl.load(&format!("layers.{i}.ffn_norm.bias"))?, + +// w1: tl.load(&format!("layers.{i}.feed_forward.w1.weight"))?, +// w1_b: tl.load(&format!("layers.{i}.feed_forward.w1.bias"))?, +// w2: tl.load(&format!("layers.{i}.feed_forward.w2.weight"))?, +// w2_b: tl.load(&format!("layers.{i}.feed_forward.w2.bias"))?, +// }; + +// layers.push(layer); +// } + +// let context = tl.finish(); + +// Ok(Bloom { +// hyperparameters, +// params, +// tokenizer, +// wte, +// norm, +// norm_bias, +// output_norm, +// output_norm_bias, +// output, +// layers, +// context, +// }) +// } + +// fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { +// InferenceSession::new( +// config, +// &self.params, +// self.hyperparameters.n_layer, +// self.hyperparameters.n_embd, +// self.hyperparameters.n_vocab, +// ) +// } + +// fn 
evaluate( +// &self, +// session: &mut InferenceSession, +// input_tokens: &[TokenId], +// output_request: &mut OutputRequest, +// ) { +// let ctx_size = self.params.context_size; + +// let Hyperparameters { +// n_vocab, +// n_embd, +// n_mult: _, +// n_head, +// n_layer, +// file_type: _, +// } = self.hyperparameters; + +// let outputs = session.compute(self.context.clone(), input_tokens, |builder| { +// let session_len = builder.n_past; +// let input_len = builder.input_length(); +// let ctx0 = builder.ctx0.borrow(); +// let (memory_k_size, memory_v_size) = ( +// builder.memory_k.element_size(), +// builder.memory_v.element_size(), +// ); +// let embd = &builder.embd; +// let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + +// // normalize embeddings +// input_layer = ctx0.op_norm(&input_layer); +// input_layer = ctx0.op_mul(&input_layer, &self.norm); +// input_layer = ctx0.op_add(&input_layer, &self.norm_bias); + +// let mut gf = ctx0.create_compute_graph(); +// for il in 0..n_layer { +// let input_self_attention = input_layer.share(); +// let mut current: ggml::Tensor; + +// // norm +// current = ctx0.op_norm(&input_layer); + +// // cur = attention_norm * cur +// current = ctx0.op_mul(¤t, &self.layers[il].attention_norm); +// current = ctx0.op_add(¤t, &self.layers[il].attention_norm_b); + +// //attention +// current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].query_key_value_b); + +// // self-attention +// let nb = current.get_nb()[1]; +// let q_current = ctx0.op_view_2d( +// ¤t, +// (n_embd, input_len), +// nb, +// //0 * std::mem::size_of::() * n_embd as usize, +// 0, +// ); +// let k_current = ctx0.op_view_2d( +// ¤t, +// (n_embd, input_len), +// nb, +// std::mem::size_of::() * n_embd, +// ); +// let v_current = ctx0.op_view_2d( +// ¤t, +// (n_embd, input_len), +// nb, +// 2 * std::mem::size_of::() * n_embd, +// ); + +// // store key and value to memory +// if input_len >= 1 { +// let k = ctx0.op_view_1d( +// builder.memory_k, +// input_len * n_embd, +// (memory_k_size * n_embd) * (il * ctx_size + session_len), +// ); + +// let v = ctx0.op_view_1d( +// builder.memory_v, +// input_len * n_embd, +// (memory_v_size * n_embd) * (il * ctx_size + session_len), +// ); + +// gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); +// gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); +// } + +// // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) +// let big_q = ctx0.op_permute( +// &ctx0.op_cpy( +// &q_current, +// &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), +// ), +// (0, 2, 1, 3), +// ); + +// // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) +// let big_k = ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// builder.memory_k, +// (session_len + input_len) * n_embd, +// il * ctx_size * memory_k_size * n_embd, +// ), +// n_embd / n_head, +// n_head, +// session_len + input_len, +// ), +// (0, 2, 1, 3), +// ); + +// // K * Q +// let k_q = ctx0.op_mul_mat(&big_k, &big_q); + +// // KQ_scaled = KQ / sqrt(n_embd/n_head) +// let k_q_scaled = ctx0.op_scale( +// &k_q, +// &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), +// ); + +// //alibi +// // KQ_scaled_alibi = KQ_scaled + alibi_bias +// let k_q_scaled_alibi = ctx0.op_alibi(&k_q_scaled, session_len, n_head, 8f32); + +// // KQ_masked = mask_past(KQ_scaled) +// let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled_alibi, session_len); + +// // KQ = soft_max(KQ_masked) +// let 
k_q_soft_max = ctx0.op_soft_max(&k_q_masked); + +// let memv_elsize = memory_v_size; + +// let v_trans = ctx0.op_cpy( +// &ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// builder.memory_v, +// (session_len + input_len) * n_embd, +// il * ctx_size * memv_elsize * n_embd, +// ), +// n_embd / n_head, +// n_head, +// session_len + input_len, +// ), +// (1, 2, 0, 3), +// ), +// &ctx0.new_tensor_3d( +// builder.memory_v.get_type(), +// session_len + input_len, +// n_embd / n_head, +// n_head, +// ), +// ); + +// let k_q_v = ctx0.op_mul_mat(&v_trans, &k_q_soft_max); + +// // KQV_merged = KQV.permute(0, 2, 1, 3) +// let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); + +// // cur = KQV_merged.contiguous().view(n_embd, N) +// current = ctx0.op_cpy( +// &k_q_v_merged, +// &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), +// ); + +// // projection +// current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].wo_b); + +// let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); + +// // feed-forward network +// // norm +// current = ctx0.op_norm(&input_feed_forward); + +// // cur = ffn_norm*cur + ffn_norm_b +// current = ctx0.op_mul(¤t, &self.layers[il].ffn_norm); + +// current = ctx0.op_add(¤t, &self.layers[il].ffn_norm_b); + +// current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); + +// current = ctx0.op_add(¤t, &self.layers[il].w1_b); + +// // SILU activation + +// current = ctx0.op_gelu(¤t); + +// current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); + +// current = ctx0.op_add(¤t, &self.layers[il].w2_b); + +// current = ctx0.op_add(¤t, &input_feed_forward); + +// // input for next layer +// input_layer = current; +// } + +// // norm +// input_layer = ctx0.op_norm(&input_layer); + +// // inpL = norm*inpL +// input_layer = ctx0.op_mul(&input_layer, &self.output_norm); + +// input_layer = ctx0.op_add(&input_layer, &self.output_norm_bias); + +// let embeddings_tensor: ggml::Tensor = input_layer.share(); + +// // lm_head +// input_layer = ctx0.op_mul_mat(&self.output, &input_layer); + +// ( +// gf, +// GraphOutputs { +// result: input_layer, +// embedding_result: embeddings_tensor, +// output_length: input_len, +// }, +// ) +// }); + +// // finish evaluation +// common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); +// common::extract_logits( +// output_request, +// &outputs.result, +// n_vocab, +// outputs.output_length, +// ); +// common::extract_embeddings( +// output_request, +// &outputs.embedding_result, +// n_embd, +// outputs.output_length, +// ); +// } + +// fn hyperparameters(&self) -> &Self::Hyperparameters { +// &self.hyperparameters +// } + +// fn tokenizer(&self) -> &Tokenizer { +// &self.tokenizer +// } + +// fn context_size(&self) -> usize { +// self.params.context_size +// } + +// fn bot_token_id(&self) -> Option { +// self.tokenizer.id("".as_bytes()) +// } + +// fn eot_token_id(&self) -> TokenId { +// self.tokenizer.id("".as_bytes()).unwrap() +// } + +// fn quantize_tensors() -> Vec { +// vec![Regex::new(".*weight").unwrap()] +// } + +// fn skip_quantize_tensors() -> Vec { +// vec![] +// } + +// fn supports_rewind(&self) -> bool { +// true +// } +// } + +// /// BLOOM [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +// #[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] +// pub struct Hyperparameters { +// /// Size of the model's vocabulary +// pub n_vocab: usize, +// /// Size of the model's embedding layer +// pub n_embd: usize, +// /// 
n_mult +// pub n_mult: usize, +// /// n_head +// pub n_head: usize, +// /// Number of layers in the model +// pub n_layer: usize, +// /// file_type +// pub file_type: FileType, +// } + +// impl llm_base::Hyperparameters for Hyperparameters { +// fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { +// Ok(Hyperparameters { +// n_vocab: util::read_i32(reader)?.try_into()?, +// n_embd: util::read_i32(reader)?.try_into()?, +// n_mult: util::read_i32(reader)?.try_into()?, +// n_head: util::read_i32(reader)?.try_into()?, +// n_layer: util::read_i32(reader)?.try_into()?, +// file_type: util::read_filetype(reader)?, +// }) +// } + +// fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { +// util::write_i32(writer, self.n_vocab.try_into()?)?; +// util::write_i32(writer, self.n_embd.try_into()?)?; +// util::write_i32(writer, self.n_mult.try_into()?)?; +// util::write_i32(writer, self.n_head.try_into()?)?; +// util::write_i32(writer, self.n_layer.try_into()?)?; +// util::write_i32(writer, self.file_type.into())?; +// Ok(()) +// } + +// fn n_vocabulary(&self) -> usize { +// self.n_vocab +// } + +// fn file_type(&self) -> Option { +// Some(self.file_type) +// } + +// fn file_type_mut(&mut self) -> Option<&mut FileType> { +// Some(&mut self.file_type) +// } +// } + +// struct Layer { +// pub attention_norm: ggml::Tensor, +// pub attention_norm_b: ggml::Tensor, +// pub wo: ggml::Tensor, +// pub wo_b: ggml::Tensor, +// pub query_key_value: ggml::Tensor, +// pub query_key_value_b: ggml::Tensor, +// // normalization +// pub ffn_norm: ggml::Tensor, +// pub ffn_norm_b: ggml::Tensor, +// // ff +// pub w1: ggml::Tensor, +// pub w1_b: ggml::Tensor, +// pub w2: ggml::Tensor, +// pub w2_b: ggml::Tensor, +// } diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index f9f6c5d7..e4f570d6 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -1,476 +1,476 @@ -//! An implementation of the [Falcon](https://falconllm.tii.ae/) model for the `llm` ecosystem. -//! -//! This implementation only works for Falcon 7B, and with 32-bit memory tensors (i.e. your inference session -//! must be configured with a 32-bit [InferenceSessionConfig]). -//! -//! This model will not be generally available in the `llm` ecosystem until Falcon 40B and 16-bit memory is -//! supported. It is currently only available as a preview. -#![deny(missing_docs)] - -use ggml::Tensor; -use llm_base::{ - ggml, - model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, - ModelContext, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, -}; - -/// The Falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae) -/// -/// # Safety -/// This implements [Send] and [Sync] as it is immutable after construction. 
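The Falcon header above notes that this implementation requires 32-bit memory tensors, i.e. an `InferenceSessionConfig` configured for 32-bit KV memory. A hedged sketch of such a configuration; the field names and the `Float32` variant are assumptions about `InferenceSessionConfig` and `ModelKVMemoryType` (both re-exported from `llm`), not confirmed by this diff:

// Sketch only: start a session with 32-bit KV memory for Falcon.
fn start_f32_session(model: &dyn llm::Model) -> llm::InferenceSession {
    let config = llm::InferenceSessionConfig {
        memory_k_type: llm::ModelKVMemoryType::Float32, // assumed field/variant names
        memory_v_type: llm::ModelKVMemoryType::Float32,
        ..Default::default()
    };
    model.start_session(config)
}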
-pub struct Falcon { - params: ModelParameters, - - hyperparameters: Hyperparameters, - - tokenizer: Tokenizer, - - // model-global weights - // weighted token embeddings - tok_embeddings: Tensor, - output_norm: Tensor, - output_norm_b: Tensor, - lm_head: Tensor, - - // weights for the model - layers: Vec, - - // must be kept alive for the model - context: ModelContext, -} - -unsafe impl Send for Falcon {} -unsafe impl Sync for Falcon {} - -impl KnownModel for Falcon { - type Hyperparameters = Hyperparameters; - - fn new( - hyperparameters: Self::Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl llm_base::TensorLoader, - ) -> Result { - let mut tl = tensor_loader; - - // model-gobal weights - let tok_embeddings = tl.load("transformer.word_embeddings.weight")?; - - let backend = params.backend(0); - - let output_norm = tl.load("transformer.ln_f.weight")?.transfer_to(backend); - let output_norm_b = tl.load("transformer.ln_f.bias")?.transfer_to(backend); - let lm_head = tl.load("lm_head.weight")?.transfer_to(backend); - - let mut layers = Vec::new(); - // utilizing n_head_kv to determine the model version (parameters) - let Hyperparameters { n_head_kv, .. } = hyperparameters; - for i in 0..hyperparameters.n_layer { - let backend = params.backend(i); - - let (input_layernorm_name, attention_norm_name) = if n_head_kv == 1 { - // falcon 7b - (format!("transformer.h.{i}.input_layernorm"), None) - } else { - // falcon 40b - ( - format!("transformer.h.{i}.ln_mlp"), - Some(format!("transformer.h.{i}.ln_attn")), - ) - }; - - let (attention_norm_weight, attention_norm_bias) = - if let Some(norm_name) = attention_norm_name { - ( - Some( - tl.load(&format!("{}.weight", norm_name))? - .transfer_to(backend), - ), - Some( - tl.load(&format!("{}.bias", norm_name))? - .transfer_to(backend), - ), - ) - } else { - (None, None) - }; - - let layer = Layer { - input_layernorm: tl - .load(&format!("{}.weight", input_layernorm_name))? - .transfer_to(backend), - input_layernorm_b: tl - .load(&format!("{}.bias", input_layernorm_name))? - .transfer_to(backend), - attention_norm: attention_norm_weight, - attention_norm_b: attention_norm_bias, - query_key_value: tl - .load(&format!( - "transformer.h.{i}.self_attention.query_key_value.weight" - ))? - .transfer_to(backend), - wo: tl - .load(&format!("transformer.h.{i}.self_attention.dense.weight"))? - .transfer_to(backend), - - ffn_up: tl - .load(&format!("transformer.h.{i}.mlp.dense_h_to_4h.weight"))? - .transfer_to(backend), - ffn_down: tl - .load(&format!("transformer.h.{i}.mlp.dense_4h_to_h.weight"))? - .transfer_to(backend), - }; - - layers.push(layer); - } - - let context = tl.finish(); - - Ok(Falcon { - hyperparameters, - params, - tokenizer, - tok_embeddings, - output_norm, - output_norm_b, - lm_head, - layers, - context, - }) - } - - fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { - InferenceSession::new( - config, - &self.params, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, - ) - } - - fn evaluate( - &self, - session: &mut InferenceSession, - input_tokens: &[TokenId], - output_request: &mut OutputRequest, - ) { - let ctx_size = self.params.context_size; - - let Hyperparameters { - n_embd, - n_head, - n_head_kv, - n_vocab, - n_layer, - .. 
- } = self.hyperparameters; - - let head_dim = n_embd / n_head; - - let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let input_len = builder.input_length(); - let n = input_len; - let session_len = builder.n_past; - - let mut ctx0 = builder.ctx0.borrow_mut(); - let embd = builder.embd; - let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, embd); - - let f32_size = std::mem::size_of::(); - - let memory_k = builder.memory_k; - let memory_k_size = memory_k.element_size(); - - let memory_v = builder.memory_v; - let memory_v_size = memory_v.element_size(); - - let mut gf = ctx0.create_compute_graph(); - - let mut current: Tensor; - let mut layernorm_output: Tensor; - - for il in 0..n_layer { - // attention uses first scratch buffer - ctx0.set_offloading(self.params.should_offload(il)); - - // self-attention - layernorm_output = ctx0.op_norm(&input_layer); - layernorm_output = ctx0.op_add( - &ctx0.op_mul(&layernorm_output, &self.layers[il].input_layernorm), - &self.layers[il].input_layernorm_b, - ); - - if n_head_kv == 1 { - // Falcon-7B only - current = layernorm_output.share(); - } else { - // Falcon-40B only - current = ctx0.op_norm(&input_layer); - current = ctx0.op_add( - &ctx0.op_mul(¤t, self.layers[il].attention_norm.as_ref().unwrap()), - self.layers[il].attention_norm_b.as_ref().unwrap(), - ); - } - - // compute QKV - current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); - - let fused_qkv_row_nb = head_dim * (n_head + 2 * n_head_kv) * f32_size; - - let mut qcur = ctx0.op_view_3d( - ¤t, - (head_dim, n_head, n), - (head_dim * f32_size, fused_qkv_row_nb), - 0, - ); - - let mut kcur = ctx0.op_view_3d( - ¤t, - (head_dim, n_head_kv, n), - (head_dim * f32_size, fused_qkv_row_nb), - head_dim * n_head * f32_size, - ); - - let vcur = ctx0.op_view_3d( - ¤t, - (head_dim, n_head_kv, n), - (head_dim * f32_size, fused_qkv_row_nb), - head_dim * (n_head + n_head_kv) * f32_size, - ); - - // using mode = 2 for neox mode - let overrides = self.params.rope_overrides.as_ref(); - qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2, overrides); - kcur = ctx0.op_rope_inplace(&kcur, session_len, head_dim, 2, overrides); - - // store key and value to memory - - let k = ctx0.op_view_1d( - memory_k, - n * n_head_kv * head_dim, - (memory_k_size * n_head_kv * head_dim) * (il * ctx_size + session_len), - ); - let v = ctx0.op_view_1d( - memory_v, - n * n_head_kv * head_dim, - (memory_v_size * n_head_kv * head_dim) * (il * ctx_size + session_len), - ); - - gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - let bigq = ctx0.op_permute(&qcur, (0, 2, 1, 3)); - - let bigk = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - memory_k, - (session_len + n) * n_head_kv * head_dim, - il * ctx_size * memory_k_size * n_head_kv * head_dim, - ), - head_dim, - n_head_kv, - session_len + n, - ), - (0, 2, 1, 3), - ); - - // K * Q - let big_kq = ctx0.op_mul_mat(&bigk, &bigq); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - let big_kq_scaled = ctx0.op_scale_inplace( - &big_kq, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - let big_kq_masked = ctx0.op_diag_mask_inf_inplace(&big_kq_scaled, session_len); - - let big_kq_softmax = ctx0.op_soft_max_inplace(&big_kq_masked); - - let mut bigv = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - memory_v, - (session_len + n) * n_head_kv * head_dim, - il * ctx_size 
* memory_v_size * n_head_kv * head_dim, - ), - head_dim, - n_head_kv, - session_len + n, - ), - (0, 2, 1, 3), - ); - bigv = ctx0.op_cont(&ctx0.op_transpose(&bigv)); - - let big_kqv = ctx0.op_mul_mat(&bigv, &big_kq_softmax); - // KQV_merged = KQV.permute(0, 2, 1, 3) - let big_kqv_merged = ctx0.op_permute(&big_kqv, (0, 2, 1, 3)); - - // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy( - &big_kqv_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), - ); - - // projection - current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - - let inp_ff = layernorm_output.share(); - let attn_out = - ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); - - current = ctx0.op_mul_mat(&self.layers[il].ffn_up, &inp_ff); - current = ctx0.op_gelu(¤t); - current = ctx0.op_mul_mat(&self.layers[il].ffn_down, ¤t); - - current = ctx0.op_add(¤t, &attn_out); - current = ctx0.op_add(¤t, &input_layer); - - input_layer = current.share(); - } - - // norm - input_layer = ctx0.op_norm(&input_layer); - - input_layer = ctx0.op_add( - &ctx0.op_mul(&input_layer, &self.output_norm), - &self.output_norm_b, - ); - - let embeddings_tensor: ggml::Tensor = input_layer.share(); - - ctx0.set_offloading(false); - - // lm_head - input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); - - ( - gf, - GraphOutputs { - result: input_layer, - embedding_result: embeddings_tensor, - output_length: n, - }, - ) - }); - - // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); - common::extract_logits( - output_request, - &outputs.result, - n_vocab, - outputs.output_length, - ); - common::extract_embeddings( - output_request, - &outputs.embedding_result, - n_embd, - outputs.output_length, - ); - } - - fn hyperparameters(&self) -> &Self::Hyperparameters { - &self.hyperparameters - } - - fn tokenizer(&self) -> &Tokenizer { - &self.tokenizer - } - - fn context_size(&self) -> usize { - self.params.context_size - } - - fn bot_token_id(&self) -> Option { - None - } - - fn eot_token_id(&self) -> TokenId { - self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() - } - - fn quantize_tensors() -> Vec { - vec![Regex::new(".*weight").unwrap()] - } - - fn skip_quantize_tensors() -> Vec { - vec![] - } -} - -/// Falcon [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -#[derive(Debug, Default, PartialEq, Clone, Copy, Eq)] -pub struct Hyperparameters { - /// Size of the model's vocabulary - n_vocab: usize, - /// Size of the model's embedding layer - n_embd: usize, - /// n_heads - n_head: usize, - // Number of heads for key-value pairs - n_head_kv: usize, - /// Number of layers in the model - n_layer: usize, - /// file_type - file_type: FileType, -} - -impl llm_base::Hyperparameters for Hyperparameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - let hyperparameters = Hyperparameters { - n_vocab: util::read_i32(reader)?.try_into()?, - n_embd: util::read_i32(reader)?.try_into()?, - n_head: util::read_i32(reader)?.try_into()?, - n_head_kv: util::read_i32(reader)?.try_into()?, - n_layer: util::read_i32(reader)?.try_into()?, - file_type: util::read_filetype(reader)?, - }; - - Ok(hyperparameters) - } - - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.n_vocab.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_head.try_into()?)?; - util::write_i32(writer, self.n_head_kv.try_into()?)?; - 
util::write_i32(writer, self.n_layer.try_into()?)?; - util::write_i32(writer, self.file_type.into())?; - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } - - fn file_type(&self) -> Option { - Some(self.file_type) - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - Some(&mut self.file_type) - } -} - -struct Layer { - // normalization - input_layernorm: Tensor, - input_layernorm_b: Tensor, - - // Falcon-40B only - attention_norm: Option, - attention_norm_b: Option, - - // attention - query_key_value: Tensor, - wo: Tensor, - - // ff - ffn_up: Tensor, - ffn_down: Tensor, -} +// //! An implementation of the [Falcon](https://falconllm.tii.ae/) model for the `llm` ecosystem. +// //! +// //! This implementation only works for Falcon 7B, and with 32-bit memory tensors (i.e. your inference session +// //! must be configured with a 32-bit [InferenceSessionConfig]). +// //! +// //! This model will not be generally available in the `llm` ecosystem until Falcon 40B and 16-bit memory is +// //! supported. It is currently only available as a preview. +// #![deny(missing_docs)] + +// use ggml::Tensor; +// use llm_base::{ +// ggml, +// model::{common, HyperparametersWriteError}, +// util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, Model, LoadError, +// ModelContext, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, +// }; + +// /// The Falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae) +// /// +// /// # Safety +// /// This implements [Send] and [Sync] as it is immutable after construction. +// pub struct Falcon { +// params: ModelParameters, + +// hyperparameters: Hyperparameters, + +// tokenizer: Tokenizer, + +// // model-global weights +// // weighted token embeddings +// tok_embeddings: Tensor, +// output_norm: Tensor, +// output_norm_b: Tensor, +// lm_head: Tensor, + +// // weights for the model +// layers: Vec, + +// // must be kept alive for the model +// context: ModelContext, +// } + +// unsafe impl Send for Falcon {} +// unsafe impl Sync for Falcon {} + +// impl Model for Falcon { +// type Hyperparameters = Hyperparameters; + +// fn new( +// hyperparameters: Self::Hyperparameters, +// params: ModelParameters, +// tokenizer: Tokenizer, +// tensor_loader: impl llm_base::TensorLoader, +// ) -> Result { +// let mut tl = tensor_loader; + +// // model-gobal weights +// let tok_embeddings = tl.load("transformer.word_embeddings.weight")?; + +// let backend = params.backend(0); + +// let output_norm = tl.load("transformer.ln_f.weight")?.transfer_to(backend); +// let output_norm_b = tl.load("transformer.ln_f.bias")?.transfer_to(backend); +// let lm_head = tl.load("lm_head.weight")?.transfer_to(backend); + +// let mut layers = Vec::new(); +// // utilizing n_head_kv to determine the model version (parameters) +// let Hyperparameters { n_head_kv, .. } = hyperparameters; +// for i in 0..hyperparameters.n_layer { +// let backend = params.backend(i); + +// let (input_layernorm_name, attention_norm_name) = if n_head_kv == 1 { +// // falcon 7b +// (format!("transformer.h.{i}.input_layernorm"), None) +// } else { +// // falcon 40b +// ( +// format!("transformer.h.{i}.ln_mlp"), +// Some(format!("transformer.h.{i}.ln_attn")), +// ) +// }; + +// let (attention_norm_weight, attention_norm_bias) = +// if let Some(norm_name) = attention_norm_name { +// ( +// Some( +// tl.load(&format!("{}.weight", norm_name))? +// .transfer_to(backend), +// ), +// Some( +// tl.load(&format!("{}.bias", norm_name))? 
+// .transfer_to(backend), +// ), +// ) +// } else { +// (None, None) +// }; + +// let layer = Layer { +// input_layernorm: tl +// .load(&format!("{}.weight", input_layernorm_name))? +// .transfer_to(backend), +// input_layernorm_b: tl +// .load(&format!("{}.bias", input_layernorm_name))? +// .transfer_to(backend), +// attention_norm: attention_norm_weight, +// attention_norm_b: attention_norm_bias, +// query_key_value: tl +// .load(&format!( +// "transformer.h.{i}.self_attention.query_key_value.weight" +// ))? +// .transfer_to(backend), +// wo: tl +// .load(&format!("transformer.h.{i}.self_attention.dense.weight"))? +// .transfer_to(backend), + +// ffn_up: tl +// .load(&format!("transformer.h.{i}.mlp.dense_h_to_4h.weight"))? +// .transfer_to(backend), +// ffn_down: tl +// .load(&format!("transformer.h.{i}.mlp.dense_4h_to_h.weight"))? +// .transfer_to(backend), +// }; + +// layers.push(layer); +// } + +// let context = tl.finish(); + +// Ok(Falcon { +// hyperparameters, +// params, +// tokenizer, +// tok_embeddings, +// output_norm, +// output_norm_b, +// lm_head, +// layers, +// context, +// }) +// } + +// fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { +// InferenceSession::new( +// config, +// &self.params, +// self.hyperparameters.n_layer, +// self.hyperparameters.n_embd, +// self.hyperparameters.n_vocab, +// ) +// } + +// fn evaluate( +// &self, +// session: &mut InferenceSession, +// input_tokens: &[TokenId], +// output_request: &mut OutputRequest, +// ) { +// let ctx_size = self.params.context_size; + +// let Hyperparameters { +// n_embd, +// n_head, +// n_head_kv, +// n_vocab, +// n_layer, +// .. +// } = self.hyperparameters; + +// let head_dim = n_embd / n_head; + +// let outputs = session.compute(self.context.clone(), input_tokens, |builder| { +// let input_len = builder.input_length(); +// let n = input_len; +// let session_len = builder.n_past; + +// let mut ctx0 = builder.ctx0.borrow_mut(); +// let embd = builder.embd; +// let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, embd); + +// let f32_size = std::mem::size_of::(); + +// let memory_k = builder.memory_k; +// let memory_k_size = memory_k.element_size(); + +// let memory_v = builder.memory_v; +// let memory_v_size = memory_v.element_size(); + +// let mut gf = ctx0.create_compute_graph(); + +// let mut current: Tensor; +// let mut layernorm_output: Tensor; + +// for il in 0..n_layer { +// // attention uses first scratch buffer +// ctx0.set_offloading(self.params.should_offload(il)); + +// // self-attention +// layernorm_output = ctx0.op_norm(&input_layer); +// layernorm_output = ctx0.op_add( +// &ctx0.op_mul(&layernorm_output, &self.layers[il].input_layernorm), +// &self.layers[il].input_layernorm_b, +// ); + +// if n_head_kv == 1 { +// // Falcon-7B only +// current = layernorm_output.share(); +// } else { +// // Falcon-40B only +// current = ctx0.op_norm(&input_layer); +// current = ctx0.op_add( +// &ctx0.op_mul(¤t, self.layers[il].attention_norm.as_ref().unwrap()), +// self.layers[il].attention_norm_b.as_ref().unwrap(), +// ); +// } + +// // compute QKV +// current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); + +// let fused_qkv_row_nb = head_dim * (n_head + 2 * n_head_kv) * f32_size; + +// let mut qcur = ctx0.op_view_3d( +// ¤t, +// (head_dim, n_head, n), +// (head_dim * f32_size, fused_qkv_row_nb), +// 0, +// ); + +// let mut kcur = ctx0.op_view_3d( +// ¤t, +// (head_dim, n_head_kv, n), +// (head_dim * f32_size, fused_qkv_row_nb), +// head_dim * n_head * f32_size, +// 
); + +// let vcur = ctx0.op_view_3d( +// ¤t, +// (head_dim, n_head_kv, n), +// (head_dim * f32_size, fused_qkv_row_nb), +// head_dim * (n_head + n_head_kv) * f32_size, +// ); + +// // using mode = 2 for neox mode +// let overrides = self.params.rope_overrides.as_ref(); +// qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2, overrides); +// kcur = ctx0.op_rope_inplace(&kcur, session_len, head_dim, 2, overrides); + +// // store key and value to memory + +// let k = ctx0.op_view_1d( +// memory_k, +// n * n_head_kv * head_dim, +// (memory_k_size * n_head_kv * head_dim) * (il * ctx_size + session_len), +// ); +// let v = ctx0.op_view_1d( +// memory_v, +// n * n_head_kv * head_dim, +// (memory_v_size * n_head_kv * head_dim) * (il * ctx_size + session_len), +// ); + +// gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); +// gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + +// // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) +// let bigq = ctx0.op_permute(&qcur, (0, 2, 1, 3)); + +// let bigk = ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// memory_k, +// (session_len + n) * n_head_kv * head_dim, +// il * ctx_size * memory_k_size * n_head_kv * head_dim, +// ), +// head_dim, +// n_head_kv, +// session_len + n, +// ), +// (0, 2, 1, 3), +// ); + +// // K * Q +// let big_kq = ctx0.op_mul_mat(&bigk, &bigq); + +// // KQ_scaled = KQ / sqrt(n_embd/n_head) +// let big_kq_scaled = ctx0.op_scale_inplace( +// &big_kq, +// &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), +// ); + +// let big_kq_masked = ctx0.op_diag_mask_inf_inplace(&big_kq_scaled, session_len); + +// let big_kq_softmax = ctx0.op_soft_max_inplace(&big_kq_masked); + +// let mut bigv = ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// memory_v, +// (session_len + n) * n_head_kv * head_dim, +// il * ctx_size * memory_v_size * n_head_kv * head_dim, +// ), +// head_dim, +// n_head_kv, +// session_len + n, +// ), +// (0, 2, 1, 3), +// ); +// bigv = ctx0.op_cont(&ctx0.op_transpose(&bigv)); + +// let big_kqv = ctx0.op_mul_mat(&bigv, &big_kq_softmax); +// // KQV_merged = KQV.permute(0, 2, 1, 3) +// let big_kqv_merged = ctx0.op_permute(&big_kqv, (0, 2, 1, 3)); + +// // cur = KQV_merged.contiguous().view(n_embd, N) +// current = ctx0.op_cpy( +// &big_kqv_merged, +// &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), +// ); + +// // projection +// current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + +// let inp_ff = layernorm_output.share(); +// let attn_out = +// ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); + +// current = ctx0.op_mul_mat(&self.layers[il].ffn_up, &inp_ff); +// current = ctx0.op_gelu(¤t); +// current = ctx0.op_mul_mat(&self.layers[il].ffn_down, ¤t); + +// current = ctx0.op_add(¤t, &attn_out); +// current = ctx0.op_add(¤t, &input_layer); + +// input_layer = current.share(); +// } + +// // norm +// input_layer = ctx0.op_norm(&input_layer); + +// input_layer = ctx0.op_add( +// &ctx0.op_mul(&input_layer, &self.output_norm), +// &self.output_norm_b, +// ); + +// let embeddings_tensor: ggml::Tensor = input_layer.share(); + +// ctx0.set_offloading(false); + +// // lm_head +// input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); + +// ( +// gf, +// GraphOutputs { +// result: input_layer, +// embedding_result: embeddings_tensor, +// output_length: n, +// }, +// ) +// }); + +// // finish evaluation +// common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); +// common::extract_logits( +// output_request, 
+// &outputs.result, +// n_vocab, +// outputs.output_length, +// ); +// common::extract_embeddings( +// output_request, +// &outputs.embedding_result, +// n_embd, +// outputs.output_length, +// ); +// } + +// fn hyperparameters(&self) -> &Self::Hyperparameters { +// &self.hyperparameters +// } + +// fn tokenizer(&self) -> &Tokenizer { +// &self.tokenizer +// } + +// fn context_size(&self) -> usize { +// self.params.context_size +// } + +// fn bot_token_id(&self) -> Option { +// None +// } + +// fn eot_token_id(&self) -> TokenId { +// self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() +// } + +// fn quantize_tensors() -> Vec { +// vec![Regex::new(".*weight").unwrap()] +// } + +// fn skip_quantize_tensors() -> Vec { +// vec![] +// } +// } + +// /// Falcon [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +// #[derive(Debug, Default, PartialEq, Clone, Copy, Eq)] +// pub struct Hyperparameters { +// /// Size of the model's vocabulary +// n_vocab: usize, +// /// Size of the model's embedding layer +// n_embd: usize, +// /// n_heads +// n_head: usize, +// // Number of heads for key-value pairs +// n_head_kv: usize, +// /// Number of layers in the model +// n_layer: usize, +// /// file_type +// file_type: FileType, +// } + +// impl llm_base::Hyperparameters for Hyperparameters { +// fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { +// let hyperparameters = Hyperparameters { +// n_vocab: util::read_i32(reader)?.try_into()?, +// n_embd: util::read_i32(reader)?.try_into()?, +// n_head: util::read_i32(reader)?.try_into()?, +// n_head_kv: util::read_i32(reader)?.try_into()?, +// n_layer: util::read_i32(reader)?.try_into()?, +// file_type: util::read_filetype(reader)?, +// }; + +// Ok(hyperparameters) +// } + +// fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { +// util::write_i32(writer, self.n_vocab.try_into()?)?; +// util::write_i32(writer, self.n_embd.try_into()?)?; +// util::write_i32(writer, self.n_head.try_into()?)?; +// util::write_i32(writer, self.n_head_kv.try_into()?)?; +// util::write_i32(writer, self.n_layer.try_into()?)?; +// util::write_i32(writer, self.file_type.into())?; +// Ok(()) +// } + +// fn n_vocabulary(&self) -> usize { +// self.n_vocab +// } + +// fn file_type(&self) -> Option { +// Some(self.file_type) +// } + +// fn file_type_mut(&mut self) -> Option<&mut FileType> { +// Some(&mut self.file_type) +// } +// } + +// struct Layer { +// // normalization +// input_layernorm: Tensor, +// input_layernorm_b: Tensor, + +// // Falcon-40B only +// attention_norm: Option, +// attention_norm_b: Option, + +// // attention +// query_key_value: Tensor, +// wo: Tensor, + +// // ff +// ffn_up: Tensor, +// ffn_down: Tensor, +// } diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index d06eb1ec..dfac1064 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -1,470 +1,470 @@ -//! An implementation of [GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2) for the `llm` ecosystem. -#![deny(missing_docs)] - -use ggml::Tensor; -use llm_base::{ - ggml, - model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, - ModelContext, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, -}; - -/// The GPT-2 model. 
Ref: [The Illustrated GPT-2](https://jalammar.github.io/illustrated-gpt2/) -/// -/// # Safety -/// This implements [Send] and [Sync] as it is immutable after construction. -pub struct Gpt2 { - params: ModelParameters, - - hyperparameters: Hyperparameters, - tokenizer: Tokenizer, - - // model-global weights - // normalization gain & bias - ln_f_g: Tensor, - ln_f_b: Tensor, - // weighted token embeddings - wte: Tensor, - // weighted positional encodings - wpe: Tensor, - // language model head - // - // Optional: if not present, the `wte` tensor is used instead. - lm_head: Option, - - // weights for the model - layers: Vec, - - // must be kept alive for the model - context: ModelContext, -} - -unsafe impl Send for Gpt2 {} -unsafe impl Sync for Gpt2 {} - -impl KnownModel for Gpt2 { - type Hyperparameters = Hyperparameters; - - fn new( - hyperparameters: Self::Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl llm_base::TensorLoader, - ) -> Result { - let mut tl = tensor_loader; - - // model-global weights - let backend = params.backend(0); - - let wpe = tl.load("model/wpe")?.transfer_to(backend); - let wte = tl.load("model/wte")?.transfer_to(backend); - - let ln_f_g = tl.load("model/ln_f/g")?.transfer_to(backend); - let ln_f_b = tl.load("model/ln_f/b")?.transfer_to(backend); - - // GPT-2's language model head is optional; if it is not present, - // the `wte` tensor is used instead. - let lm_head = { - if let Ok(tensor) = tl.load("model/lm_head") { - Some(tensor.transfer_to(backend)) - } else { - None - } - }; - - let mut layers = Vec::new(); - for i in 0..hyperparameters.n_layer { - let backend = params.backend(i); - let layer = Layer { - ln_1_g: tl.load(&format!("model/h{i}/ln_1/g"))?.transfer_to(backend), - ln_1_b: tl.load(&format!("model/h{i}/ln_1/b"))?.transfer_to(backend), - ln_2_g: tl.load(&format!("model/h{i}/ln_2/g"))?.transfer_to(backend), - ln_2_b: tl.load(&format!("model/h{i}/ln_2/b"))?.transfer_to(backend), - c_attn_attn_w: tl - .load(&format!("model/h{i}/attn/c_attn/w"))? - .transfer_to(backend), - c_attn_attn_b: tl - .load(&format!("model/h{i}/attn/c_attn/b"))? - .transfer_to(backend), - c_attn_proj_w: tl - .load(&format!("model/h{i}/attn/c_proj/w"))? - .transfer_to(backend), - c_attn_proj_b: tl - .load(&format!("model/h{i}/attn/c_proj/b"))? - .transfer_to(backend), - c_mlp_fc_w: tl - .load(&format!("model/h{i}/mlp/c_fc/w"))? - .transfer_to(backend), - c_mlp_fc_b: tl - .load(&format!("model/h{i}/mlp/c_fc/b"))? - .transfer_to(backend), - c_mlp_proj_w: tl - .load(&format!("model/h{i}/mlp/c_proj/w"))? - .transfer_to(backend), - c_mlp_proj_b: tl - .load(&format!("model/h{i}/mlp/c_proj/b"))? - .transfer_to(backend), - }; - - layers.push(layer); - } - - let context = tl.finish(); - - Ok(Gpt2 { - hyperparameters, - params, - tokenizer, - layers, - ln_f_g, - ln_f_b, - wte, - wpe, - lm_head, - context, - }) - } - - fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { - InferenceSession::new( - config, - &self.params, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, - ) - } - - fn evaluate( - &self, - session: &mut InferenceSession, - input_tokens: &[TokenId], - output_request: &mut OutputRequest, - ) { - let ctx_size = self.params.context_size; - - let Hyperparameters { - n_embd, - n_head, - n_vocab, - n_layer, - .. 
- } = self.hyperparameters; - - let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let input_len = builder.input_length(); - let session_len = builder.n_past; - let mut ctx0 = builder.ctx0.borrow_mut(); - let (memory_k_size, memory_v_size) = ( - builder.memory_k.element_size(), - builder.memory_v.element_size(), - ); - let embd = &builder.embd; - - let position_buf: Vec = (0..input_len).map(|i| (session_len + i) as i32).collect(); - - let mut position = ctx0.new_tensor_1d(ggml::Type::I32, input_len); - unsafe { position.write_data(bytemuck::cast_slice(&position_buf)) }; - - let mut input_layer = ctx0.op_add( - &ctx0.op_get_rows(&self.wte, embd), - &ctx0.op_get_rows(&self.wpe, &position), - ); - - let mut gf = ctx0.create_compute_graph(); - for il in 0..n_layer { - ctx0.set_offloading(self.params.should_offload(il)); - - // norm - let mut current = ctx0.op_norm(&input_layer); - current = ctx0.op_add( - &ctx0.op_mul(¤t, &self.layers[il].ln_1_g), - &self.layers[il].ln_1_b, - ); - - // attn - current = ctx0.op_mul_mat(&self.layers[il].c_attn_attn_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].c_attn_attn_b); - - // self-attn - let nb = current.get_nb()[1]; - let f32_size = std::mem::size_of::(); - let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); - let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); - let vcur = - ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); - - if input_len >= 1 { - let k = ctx0.op_view_1d( - builder.memory_k, - input_len * n_embd, - (memory_k_size * n_embd) * (il * ctx_size + session_len), - ); - let v = ctx0.op_view_1d( - builder.memory_v, - input_len * n_embd, - (memory_v_size * n_embd) * (il * ctx_size + session_len), - ); - - gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - } - - let q = ctx0.op_permute( - &ctx0.op_cpy( - &qcur, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), - ), - (0, 2, 1, 3), - ); - - let k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - builder.memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * memory_k_size * n_embd, - ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); - - let kq = ctx0.op_mul_mat(&k, &q); - let kq_scaled = ctx0.op_scale_inplace( - &kq, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - let kq_masked = ctx0.op_diag_mask_inf_inplace(&kq_scaled, session_len); - let kq_softmax = ctx0.op_soft_max_inplace(&kq_masked); - - let v_trans = ctx0.op_cpy( - &ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - builder.memory_v, - (session_len + input_len) * n_embd, - il * ctx_size * memory_v_size * n_embd, - ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (1, 2, 0, 3), - ), - &ctx0.new_tensor_3d( - builder.memory_v.get_type(), - session_len + input_len, - n_embd / n_head, - n_head, - ), - ); - - let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); - let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - - current = ctx0.op_cpy( - &kqv_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); - - // projection - current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].c_attn_proj_b); - - // add input - current = ctx0.op_add(¤t, &input_layer); - - // feed-forward - let ff_in = current.share(); - - // feed-forward normalization - current = ctx0.op_norm(&ff_in); - current = ctx0.op_add( - 
&ctx0.op_mul(¤t, &self.layers[il].ln_2_g), - &self.layers[il].ln_2_b, - ); - - // feed-forward fully connected - current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].c_mlp_fc_b); - - // feed-forward activation - current = ctx0.op_gelu(¤t); - - // feed-forward projection - current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].c_mlp_proj_b); - - // input for next layer - input_layer = ctx0.op_add(¤t, &ff_in); - } - - // normalization - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_f_g), &self.ln_f_b); - - ctx0.set_offloading(false); - - let embeddings_tensor: ggml::Tensor = input_layer.share(); - - let head = self.lm_head.as_ref().unwrap_or(&self.wte); - input_layer = ctx0.op_mul_mat(head, &input_layer); - - ( - gf, - GraphOutputs { - result: input_layer, - embedding_result: embeddings_tensor, - output_length: input_len, - }, - ) - }); - - // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); - common::extract_logits( - output_request, - &outputs.result, - n_vocab, - outputs.output_length, - ); - common::extract_embeddings( - output_request, - &outputs.embedding_result, - n_embd, - outputs.output_length, - ); - } - - fn hyperparameters(&self) -> &Self::Hyperparameters { - &self.hyperparameters - } - - fn tokenizer(&self) -> &Tokenizer { - &self.tokenizer - } - - fn context_size(&self) -> usize { - self.params.context_size - } - - fn bot_token_id(&self) -> Option { - None - } - - fn eot_token_id(&self) -> TokenId { - self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() - } - - fn quantize_tensors() -> Vec { - [ - "model/wte", - "model/lm_head", - "model/h.*/attn/c_attn/w", - "model/h.*/attn/c_proj/w", - "model/h.*/mlp/c_fc/w", - "model/h.*/mlp/c_proj/w", - ] - .into_iter() - .map(|s| Regex::new(s).unwrap()) - .collect() - } - - fn skip_quantize_tensors() -> Vec { - vec![] - } -} - -/// GPT-2 [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] -pub struct Hyperparameters { - /// Size of the model's vocabulary - n_vocab: usize, - /// Size of the model's context - n_ctx: usize, - /// Size of the model's embedding layer - n_embd: usize, - /// n_head - n_head: usize, - /// Number of layers in the model - n_layer: usize, - /// file type - file_type: FileType, -} - -impl llm_base::Hyperparameters for Hyperparameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - let hyperparameters = Hyperparameters { - n_vocab: util::read_i32(reader)?.try_into()?, - n_ctx: util::read_i32(reader)?.try_into()?, - n_embd: util::read_i32(reader)?.try_into()?, - n_head: util::read_i32(reader)?.try_into()?, - n_layer: util::read_i32(reader)?.try_into()?, - file_type: util::read_filetype(reader)?, - }; - - let n_vocab = util::read_i32(reader)? 
as usize; - if hyperparameters.n_vocab != n_vocab { - return Err(LoadError::InvariantBroken { - path: None, - invariant: format!( - "GPT2 model expected n_vocab {} found {}", - hyperparameters.n_vocab, n_vocab - ), - }); - } - - Ok(hyperparameters) - } - - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.n_vocab.try_into()?)?; - util::write_i32(writer, self.n_ctx.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_head.try_into()?)?; - util::write_i32(writer, self.n_layer.try_into()?)?; - util::write_i32(writer, self.file_type.into())?; - util::write_i32(writer, self.n_vocab.try_into()?)?; - - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } - - fn file_type(&self) -> Option { - Some(self.file_type) - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - Some(&mut self.file_type) - } -} - -struct Layer { - // normalization - ln_1_g: Tensor, - ln_1_b: Tensor, - - ln_2_g: Tensor, - ln_2_b: Tensor, - - // attention - c_attn_attn_w: Tensor, - c_attn_attn_b: Tensor, - - c_attn_proj_w: Tensor, - c_attn_proj_b: Tensor, - - // mlp - c_mlp_fc_w: Tensor, - c_mlp_fc_b: Tensor, - - c_mlp_proj_w: Tensor, - c_mlp_proj_b: Tensor, -} +// //! An implementation of [GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2) for the `llm` ecosystem. +// #![deny(missing_docs)] + +// use ggml::Tensor; +// use llm_base::{ +// ggml, +// model::{common, HyperparametersWriteError}, +// util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, Model, LoadError, +// ModelContext, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, +// }; + +// /// The GPT-2 model. Ref: [The Illustrated GPT-2](https://jalammar.github.io/illustrated-gpt2/) +// /// +// /// # Safety +// /// This implements [Send] and [Sync] as it is immutable after construction. +// pub struct Gpt2 { +// params: ModelParameters, + +// hyperparameters: Hyperparameters, +// tokenizer: Tokenizer, + +// // model-global weights +// // normalization gain & bias +// ln_f_g: Tensor, +// ln_f_b: Tensor, +// // weighted token embeddings +// wte: Tensor, +// // weighted positional encodings +// wpe: Tensor, +// // language model head +// // +// // Optional: if not present, the `wte` tensor is used instead. +// lm_head: Option, + +// // weights for the model +// layers: Vec, + +// // must be kept alive for the model +// context: ModelContext, +// } + +// unsafe impl Send for Gpt2 {} +// unsafe impl Sync for Gpt2 {} + +// impl Model for Gpt2 { +// type Hyperparameters = Hyperparameters; + +// fn new( +// hyperparameters: Self::Hyperparameters, +// params: ModelParameters, +// tokenizer: Tokenizer, +// tensor_loader: impl llm_base::TensorLoader, +// ) -> Result { +// let mut tl = tensor_loader; + +// // model-global weights +// let backend = params.backend(0); + +// let wpe = tl.load("model/wpe")?.transfer_to(backend); +// let wte = tl.load("model/wte")?.transfer_to(backend); + +// let ln_f_g = tl.load("model/ln_f/g")?.transfer_to(backend); +// let ln_f_b = tl.load("model/ln_f/b")?.transfer_to(backend); + +// // GPT-2's language model head is optional; if it is not present, +// // the `wte` tensor is used instead. 
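+// // (In other words the model ties weights: when `model/lm_head` is absent,
+// // `evaluate()` falls back to the token-embedding matrix via
+// // `self.lm_head.as_ref().unwrap_or(&self.wte)` further down.)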
+// let lm_head = { +// if let Ok(tensor) = tl.load("model/lm_head") { +// Some(tensor.transfer_to(backend)) +// } else { +// None +// } +// }; + +// let mut layers = Vec::new(); +// for i in 0..hyperparameters.n_layer { +// let backend = params.backend(i); +// let layer = Layer { +// ln_1_g: tl.load(&format!("model/h{i}/ln_1/g"))?.transfer_to(backend), +// ln_1_b: tl.load(&format!("model/h{i}/ln_1/b"))?.transfer_to(backend), +// ln_2_g: tl.load(&format!("model/h{i}/ln_2/g"))?.transfer_to(backend), +// ln_2_b: tl.load(&format!("model/h{i}/ln_2/b"))?.transfer_to(backend), +// c_attn_attn_w: tl +// .load(&format!("model/h{i}/attn/c_attn/w"))? +// .transfer_to(backend), +// c_attn_attn_b: tl +// .load(&format!("model/h{i}/attn/c_attn/b"))? +// .transfer_to(backend), +// c_attn_proj_w: tl +// .load(&format!("model/h{i}/attn/c_proj/w"))? +// .transfer_to(backend), +// c_attn_proj_b: tl +// .load(&format!("model/h{i}/attn/c_proj/b"))? +// .transfer_to(backend), +// c_mlp_fc_w: tl +// .load(&format!("model/h{i}/mlp/c_fc/w"))? +// .transfer_to(backend), +// c_mlp_fc_b: tl +// .load(&format!("model/h{i}/mlp/c_fc/b"))? +// .transfer_to(backend), +// c_mlp_proj_w: tl +// .load(&format!("model/h{i}/mlp/c_proj/w"))? +// .transfer_to(backend), +// c_mlp_proj_b: tl +// .load(&format!("model/h{i}/mlp/c_proj/b"))? +// .transfer_to(backend), +// }; + +// layers.push(layer); +// } + +// let context = tl.finish(); + +// Ok(Gpt2 { +// hyperparameters, +// params, +// tokenizer, +// layers, +// ln_f_g, +// ln_f_b, +// wte, +// wpe, +// lm_head, +// context, +// }) +// } + +// fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { +// InferenceSession::new( +// config, +// &self.params, +// self.hyperparameters.n_layer, +// self.hyperparameters.n_embd, +// self.hyperparameters.n_vocab, +// ) +// } + +// fn evaluate( +// &self, +// session: &mut InferenceSession, +// input_tokens: &[TokenId], +// output_request: &mut OutputRequest, +// ) { +// let ctx_size = self.params.context_size; + +// let Hyperparameters { +// n_embd, +// n_head, +// n_vocab, +// n_layer, +// .. 
+// } = self.hyperparameters; + +// let outputs = session.compute(self.context.clone(), input_tokens, |builder| { +// let input_len = builder.input_length(); +// let session_len = builder.n_past; +// let mut ctx0 = builder.ctx0.borrow_mut(); +// let (memory_k_size, memory_v_size) = ( +// builder.memory_k.element_size(), +// builder.memory_v.element_size(), +// ); +// let embd = &builder.embd; + +// let position_buf: Vec = (0..input_len).map(|i| (session_len + i) as i32).collect(); + +// let mut position = ctx0.new_tensor_1d(ggml::Type::I32, input_len); +// unsafe { position.write_data(bytemuck::cast_slice(&position_buf)) }; + +// let mut input_layer = ctx0.op_add( +// &ctx0.op_get_rows(&self.wte, embd), +// &ctx0.op_get_rows(&self.wpe, &position), +// ); + +// let mut gf = ctx0.create_compute_graph(); +// for il in 0..n_layer { +// ctx0.set_offloading(self.params.should_offload(il)); + +// // norm +// let mut current = ctx0.op_norm(&input_layer); +// current = ctx0.op_add( +// &ctx0.op_mul(¤t, &self.layers[il].ln_1_g), +// &self.layers[il].ln_1_b, +// ); + +// // attn +// current = ctx0.op_mul_mat(&self.layers[il].c_attn_attn_w, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].c_attn_attn_b); + +// // self-attn +// let nb = current.get_nb()[1]; +// let f32_size = std::mem::size_of::(); +// let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); +// let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); +// let vcur = +// ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); + +// if input_len >= 1 { +// let k = ctx0.op_view_1d( +// builder.memory_k, +// input_len * n_embd, +// (memory_k_size * n_embd) * (il * ctx_size + session_len), +// ); +// let v = ctx0.op_view_1d( +// builder.memory_v, +// input_len * n_embd, +// (memory_v_size * n_embd) * (il * ctx_size + session_len), +// ); + +// gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); +// gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); +// } + +// let q = ctx0.op_permute( +// &ctx0.op_cpy( +// &qcur, +// &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), +// ), +// (0, 2, 1, 3), +// ); + +// let k = ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// builder.memory_k, +// (session_len + input_len) * n_embd, +// il * ctx_size * memory_k_size * n_embd, +// ), +// n_embd / n_head, +// n_head, +// session_len + input_len, +// ), +// (0, 2, 1, 3), +// ); + +// let kq = ctx0.op_mul_mat(&k, &q); +// let kq_scaled = ctx0.op_scale_inplace( +// &kq, +// &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), +// ); + +// let kq_masked = ctx0.op_diag_mask_inf_inplace(&kq_scaled, session_len); +// let kq_softmax = ctx0.op_soft_max_inplace(&kq_masked); + +// let v_trans = ctx0.op_cpy( +// &ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// builder.memory_v, +// (session_len + input_len) * n_embd, +// il * ctx_size * memory_v_size * n_embd, +// ), +// n_embd / n_head, +// n_head, +// session_len + input_len, +// ), +// (1, 2, 0, 3), +// ), +// &ctx0.new_tensor_3d( +// builder.memory_v.get_type(), +// session_len + input_len, +// n_embd / n_head, +// n_head, +// ), +// ); + +// let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); +// let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + +// current = ctx0.op_cpy( +// &kqv_merged, +// &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), +// ); + +// // projection +// current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].c_attn_proj_b); + 
+// // add input +// current = ctx0.op_add(¤t, &input_layer); + +// // feed-forward +// let ff_in = current.share(); + +// // feed-forward normalization +// current = ctx0.op_norm(&ff_in); +// current = ctx0.op_add( +// &ctx0.op_mul(¤t, &self.layers[il].ln_2_g), +// &self.layers[il].ln_2_b, +// ); + +// // feed-forward fully connected +// current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].c_mlp_fc_b); + +// // feed-forward activation +// current = ctx0.op_gelu(¤t); + +// // feed-forward projection +// current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].c_mlp_proj_b); + +// // input for next layer +// input_layer = ctx0.op_add(¤t, &ff_in); +// } + +// // normalization +// input_layer = ctx0.op_norm(&input_layer); +// input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_f_g), &self.ln_f_b); + +// ctx0.set_offloading(false); + +// let embeddings_tensor: ggml::Tensor = input_layer.share(); + +// let head = self.lm_head.as_ref().unwrap_or(&self.wte); +// input_layer = ctx0.op_mul_mat(head, &input_layer); + +// ( +// gf, +// GraphOutputs { +// result: input_layer, +// embedding_result: embeddings_tensor, +// output_length: input_len, +// }, +// ) +// }); + +// // finish evaluation +// common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); +// common::extract_logits( +// output_request, +// &outputs.result, +// n_vocab, +// outputs.output_length, +// ); +// common::extract_embeddings( +// output_request, +// &outputs.embedding_result, +// n_embd, +// outputs.output_length, +// ); +// } + +// fn hyperparameters(&self) -> &Self::Hyperparameters { +// &self.hyperparameters +// } + +// fn tokenizer(&self) -> &Tokenizer { +// &self.tokenizer +// } + +// fn context_size(&self) -> usize { +// self.params.context_size +// } + +// fn bot_token_id(&self) -> Option { +// None +// } + +// fn eot_token_id(&self) -> TokenId { +// self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() +// } + +// fn quantize_tensors() -> Vec { +// [ +// "model/wte", +// "model/lm_head", +// "model/h.*/attn/c_attn/w", +// "model/h.*/attn/c_proj/w", +// "model/h.*/mlp/c_fc/w", +// "model/h.*/mlp/c_proj/w", +// ] +// .into_iter() +// .map(|s| Regex::new(s).unwrap()) +// .collect() +// } + +// fn skip_quantize_tensors() -> Vec { +// vec![] +// } +// } + +// /// GPT-2 [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +// #[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] +// pub struct Hyperparameters { +// /// Size of the model's vocabulary +// n_vocab: usize, +// /// Size of the model's context +// n_ctx: usize, +// /// Size of the model's embedding layer +// n_embd: usize, +// /// n_head +// n_head: usize, +// /// Number of layers in the model +// n_layer: usize, +// /// file type +// file_type: FileType, +// } + +// impl llm_base::Hyperparameters for Hyperparameters { +// fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { +// let hyperparameters = Hyperparameters { +// n_vocab: util::read_i32(reader)?.try_into()?, +// n_ctx: util::read_i32(reader)?.try_into()?, +// n_embd: util::read_i32(reader)?.try_into()?, +// n_head: util::read_i32(reader)?.try_into()?, +// n_layer: util::read_i32(reader)?.try_into()?, +// file_type: util::read_filetype(reader)?, +// }; + +// let n_vocab = util::read_i32(reader)? 
as usize; +// if hyperparameters.n_vocab != n_vocab { +// return Err(LoadError::InvariantBroken { +// path: None, +// invariant: format!( +// "GPT2 model expected n_vocab {} found {}", +// hyperparameters.n_vocab, n_vocab +// ), +// }); +// } + +// Ok(hyperparameters) +// } + +// fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { +// util::write_i32(writer, self.n_vocab.try_into()?)?; +// util::write_i32(writer, self.n_ctx.try_into()?)?; +// util::write_i32(writer, self.n_embd.try_into()?)?; +// util::write_i32(writer, self.n_head.try_into()?)?; +// util::write_i32(writer, self.n_layer.try_into()?)?; +// util::write_i32(writer, self.file_type.into())?; +// util::write_i32(writer, self.n_vocab.try_into()?)?; + +// Ok(()) +// } + +// fn n_vocabulary(&self) -> usize { +// self.n_vocab +// } + +// fn file_type(&self) -> Option { +// Some(self.file_type) +// } + +// fn file_type_mut(&mut self) -> Option<&mut FileType> { +// Some(&mut self.file_type) +// } +// } + +// struct Layer { +// // normalization +// ln_1_g: Tensor, +// ln_1_b: Tensor, + +// ln_2_g: Tensor, +// ln_2_b: Tensor, + +// // attention +// c_attn_attn_w: Tensor, +// c_attn_attn_b: Tensor, + +// c_attn_proj_w: Tensor, +// c_attn_proj_b: Tensor, + +// // mlp +// c_mlp_fc_w: Tensor, +// c_mlp_fc_b: Tensor, + +// c_mlp_proj_w: Tensor, +// c_mlp_proj_b: Tensor, +// } diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index b4ee3d82..9baff116 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -1,446 +1,446 @@ -//! An implementation of [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj) for the `llm` ecosystem. -#![deny(missing_docs)] - -use std::error::Error; - -use ggml::Tensor; -use llm_base::{ - ggml, - model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, - ModelContext, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, -}; - -/// The GPT-J model. Ref: [GitHub](https://github.com/kingoflolz/mesh-transformer-jax/#gpt-j-6b) -/// -/// # Safety -/// This implements [Send] and [Sync] as it is immutable after construction. 
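// --- Editor's note (illustrative sketch, not part of the patch) ---------------
// GPT-2 above and GPT-J below keep their key cache as one flat tensor: each
// layer owns a contiguous slab of ctx_size * n_embd elements, and a step that
// starts with `session_len` (n_past) tokens already cached writes its new keys
// immediately after them. This helper restates the byte offset used by the
// `op_view_1d` calls on `memory_k`; the name and signature are assumptions made
// for illustration only.
fn key_cache_byte_offset(
    element_size: usize, // bytes per element of the key-memory tensor
    n_embd: usize,       // embedding width
    ctx_size: usize,     // maximum context length of the session
    layer: usize,        // layer index (il)
    session_len: usize,  // tokens already stored (n_past)
) -> usize {
    (element_size * n_embd) * (layer * ctx_size + session_len)
}
// -------------------------------------------------------------------------------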
-pub struct GptJ { - params: ModelParameters, - - hyperparameters: Hyperparameters, - tokenizer: Tokenizer, - - // model-global weights - // normalization gain & bias - ln_f_g: Tensor, - ln_f_b: Tensor, - // weighted token embeddings - wte: Tensor, - // language model head gain & bias - lmh_g: Tensor, - lmh_b: Tensor, - - // weights for the model - layers: Vec, - - // must be kept alive for the model - context: ModelContext, -} - -unsafe impl Send for GptJ {} -unsafe impl Sync for GptJ {} - -impl KnownModel for GptJ { - type Hyperparameters = Hyperparameters; - - fn new( - hyperparameters: Self::Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl TensorLoader, - ) -> Result - where - Self: Sized, - { - let mut tl = tensor_loader; - - // model-global weights - let wte = tl.load("transformer.wte.weight")?; - - let backend = params.backend(0); - - let ln_f_g = tl.load("transformer.ln_f.weight")?.transfer_to(backend); - let ln_f_b = tl.load("transformer.ln_f.bias")?.transfer_to(backend); - let lmh_g = tl.load("lm_head.weight")?.transfer_to(backend); - let lmh_b = tl.load("lm_head.bias")?.transfer_to(backend); - - let mut layers = Vec::new(); - for i in 0..hyperparameters.n_layer { - let backend = params.backend(i); - - let layer = Layer { - ln_1_g: tl - .load(&format!("transformer.h.{i}.ln_1.weight"))? - .transfer_to(backend), - ln_1_b: tl - .load(&format!("transformer.h.{i}.ln_1.bias"))? - .transfer_to(backend), - c_attn_q_proj_w: tl - .load(&format!("transformer.h.{i}.attn.q_proj.weight"))? - .transfer_to(backend), - c_attn_k_proj_w: tl - .load(&format!("transformer.h.{i}.attn.k_proj.weight"))? - .transfer_to(backend), - c_attn_v_proj_w: tl - .load(&format!("transformer.h.{i}.attn.v_proj.weight"))? - .transfer_to(backend), - c_attn_proj_w: tl - .load(&format!("transformer.h.{i}.attn.out_proj.weight"))? - .transfer_to(backend), - c_mlp_fc_w: tl - .load(&format!("transformer.h.{i}.mlp.fc_in.weight"))? - .transfer_to(backend), - c_mlp_fc_b: tl - .load(&format!("transformer.h.{i}.mlp.fc_in.bias"))? - .transfer_to(backend), - c_mlp_proj_w: tl - .load(&format!("transformer.h.{i}.mlp.fc_out.weight"))? - .transfer_to(backend), - c_mlp_proj_b: tl - .load(&format!("transformer.h.{i}.mlp.fc_out.bias"))? - .transfer_to(backend), - }; - - layers.push(layer); - } - - let context = tl.finish(); - - Ok(GptJ { - hyperparameters, - params, - tokenizer, - ln_f_g, - ln_f_b, - wte, - lmh_g, - lmh_b, - layers, - context, - }) - } - - fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { - InferenceSession::new( - config, - &self.params, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, - ) - } - - fn evaluate( - &self, - session: &mut InferenceSession, - input_tokens: &[TokenId], - output_request: &mut OutputRequest, - ) { - let ctx_size = self.params.context_size; - - let Hyperparameters { - n_embd, - n_head, - n_vocab, - n_layer, - n_rot, - .. 
- } = self.hyperparameters; - - let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let input_len = builder.input_length(); - let session_len = builder.n_past; - - let mut ctx0 = builder.ctx0.borrow_mut(); - let (memory_k_size, memory_v_size) = ( - builder.memory_k.element_size(), - builder.memory_v.element_size(), - ); - let embd = builder.embd; - - let mut input_layer = ctx0.op_get_rows(&self.wte, embd); - - let mut gf = ctx0.create_compute_graph(); - for il in 0..n_layer { - ctx0.set_offloading(self.params.should_offload(il)); - - // norm - let mut current = ctx0.op_norm(&input_layer); - current = ctx0.op_add( - &ctx0.op_mul(¤t, &self.layers[il].ln_1_g), - &self.layers[il].ln_1_b, - ); - - let input_sa = current.share(); - - // self-attention - let overrides = self.params.rope_overrides.as_ref(); - let qcur = ctx0.op_rope_inplace( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].c_attn_q_proj_w, ¤t), - n_embd / n_head, - n_head, - input_len, - ), - session_len, - n_rot, - 0, - overrides, - ); - let kcur = ctx0.op_rope_inplace( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].c_attn_k_proj_w, ¤t), - n_embd / n_head, - n_head, - input_len, - ), - session_len, - n_rot, - 0, - overrides, - ); - - // self-attention store key and value to memory - let vcur = - ctx0.op_transpose(&ctx0.op_mul_mat(&self.layers[il].c_attn_v_proj_w, ¤t)); - - let k = ctx0.op_view_1d( - builder.memory_k, - input_len * n_embd, - (memory_k_size * n_embd) * (il * ctx_size + session_len), - ); - let v = ctx0.op_view_2d( - builder.memory_v, - (input_len, n_embd), - ctx_size * memory_v_size, - (il * ctx_size) * memory_v_size * n_embd + session_len * memory_v_size, - ); - - gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - - let q = ctx0.op_permute(&qcur, (0, 2, 1, 3)); - let big_k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - builder.memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * memory_k_size * n_embd, - ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); - - let kq = ctx0.op_mul_mat(&big_k, &q); - let kq_scaled = ctx0.op_scale_inplace( - &kq, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - let kq_masked = ctx0.op_diag_mask_inf_inplace(&kq_scaled, session_len); - let kq_softmax = ctx0.op_soft_max_inplace(&kq_masked); - - let big_v = ctx0.op_view_3d( - builder.memory_v, - (session_len + input_len, n_embd / n_head, n_head), - ( - ctx_size * memory_v_size, - ctx_size * memory_v_size * n_embd / n_head, - ), - il * ctx_size * memory_v_size * n_embd, - ); - - let kqv = ctx0.op_mul_mat(&big_v, &kq_softmax); - let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - - current = ctx0.op_cpy( - &kqv_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); - - // self-attention projection - current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); - - // feed-forward - let ff_in = current.share(); - - current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, &input_sa); - current = ctx0.op_add(¤t, &self.layers[il].c_mlp_fc_b); - - current = ctx0.op_gelu(¤t); - - // feed-forward projection - current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].c_mlp_proj_b); - - current = ctx0.op_add(¤t, &ff_in); - - // input for next layer - input_layer = ctx0.op_add(¤t, &input_layer); - } - - // norm - input_layer = ctx0.op_norm(&input_layer); - input_layer = 
ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_f_g), &self.ln_f_b); - - let embeddings_tensor: ggml::Tensor = input_layer.share(); - - // lm_head - input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer); - - ctx0.set_offloading(false); - - input_layer = ctx0.op_add(&input_layer, &self.lmh_b); - - ( - gf, - GraphOutputs { - result: input_layer, - embedding_result: embeddings_tensor, - output_length: input_len, - }, - ) - }); - - // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); - common::extract_logits( - output_request, - &outputs.result, - n_vocab, - outputs.output_length, - ); - common::extract_embeddings( - output_request, - &outputs.embedding_result, - n_embd, - outputs.output_length, - ); - } - - fn hyperparameters(&self) -> &Self::Hyperparameters { - &self.hyperparameters - } - - fn tokenizer(&self) -> &Tokenizer { - &self.tokenizer - } - - fn context_size(&self) -> usize { - self.params.context_size - } - - fn bot_token_id(&self) -> Option { - None - } - - fn eot_token_id(&self) -> TokenId { - self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() - } - - fn quantize_tensors() -> Vec { - vec![Regex::new(".*weight").unwrap()] - } - - fn skip_quantize_tensors() -> Vec { - vec![] - } - - fn supports_rewind(&self) -> bool { - true - } -} - -/// GPT-J [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] -pub struct Hyperparameters { - /// Size of the model's vocabulary - pub n_vocab: usize, - /// Size of the model's context - pub n_ctx: usize, - /// Size of the model's embedding layer - pub n_embd: usize, - /// n_head - pub n_head: usize, - /// Number of layers in the model - pub n_layer: usize, - /// n_rot - pub n_rot: usize, - /// file_type - pub file_type: FileType, -} - -impl llm_base::Hyperparameters for Hyperparameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - let hyperparameters = Hyperparameters { - n_vocab: util::read_i32(reader)?.try_into()?, - n_ctx: util::read_i32(reader)?.try_into()?, - n_embd: util::read_i32(reader)?.try_into()?, - n_head: util::read_i32(reader)?.try_into()?, - n_layer: util::read_i32(reader)?.try_into()?, - n_rot: util::read_i32(reader)?.try_into()?, - file_type: util::read_filetype(reader)?, - }; - - let n_vocab = util::read_i32(reader)? 
as usize; - if hyperparameters.n_vocab != n_vocab { - return Err(LoadError::InvariantBroken { - path: None, - invariant: format!( - "GPTJ model expected n_vocab {} found {}", - hyperparameters.n_vocab, n_vocab - ), - }); - } - - Ok(hyperparameters) - } - - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.n_vocab.try_into()?)?; - util::write_i32(writer, self.n_ctx.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_head.try_into()?)?; - util::write_i32(writer, self.n_layer.try_into()?)?; - util::write_i32(writer, self.n_rot.try_into()?)?; - util::write_i32(writer, self.file_type.into())?; - util::write_i32(writer, self.n_vocab.try_into()?)?; - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } - - fn file_type(&self) -> Option { - Some(self.file_type) - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - Some(&mut self.file_type) - } -} - -struct Layer { - // normalization - ln_1_g: Tensor, - ln_1_b: Tensor, - - // attention - c_attn_q_proj_w: Tensor, - c_attn_k_proj_w: Tensor, - c_attn_v_proj_w: Tensor, - - c_attn_proj_w: Tensor, - - // ff - c_mlp_fc_w: Tensor, - c_mlp_fc_b: Tensor, - - c_mlp_proj_w: Tensor, - c_mlp_proj_b: Tensor, -} +// //! An implementation of [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj) for the `llm` ecosystem. +// #![deny(missing_docs)] + +// use std::error::Error; + +// use ggml::Tensor; +// use llm_base::{ +// ggml, +// model::{common, HyperparametersWriteError}, +// util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, Model, LoadError, +// ModelContext, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, +// }; + +// /// The GPT-J model. Ref: [GitHub](https://github.com/kingoflolz/mesh-transformer-jax/#gpt-j-6b) +// /// +// /// # Safety +// /// This implements [Send] and [Sync] as it is immutable after construction. +// pub struct GptJ { +// params: ModelParameters, + +// hyperparameters: Hyperparameters, +// tokenizer: Tokenizer, + +// // model-global weights +// // normalization gain & bias +// ln_f_g: Tensor, +// ln_f_b: Tensor, +// // weighted token embeddings +// wte: Tensor, +// // language model head gain & bias +// lmh_g: Tensor, +// lmh_b: Tensor, + +// // weights for the model +// layers: Vec, + +// // must be kept alive for the model +// context: ModelContext, +// } + +// unsafe impl Send for GptJ {} +// unsafe impl Sync for GptJ {} + +// impl Model for GptJ { +// type Hyperparameters = Hyperparameters; + +// fn new( +// hyperparameters: Self::Hyperparameters, +// params: ModelParameters, +// tokenizer: Tokenizer, +// tensor_loader: impl TensorLoader, +// ) -> Result +// where +// Self: Sized, +// { +// let mut tl = tensor_loader; + +// // model-global weights +// let wte = tl.load("transformer.wte.weight")?; + +// let backend = params.backend(0); + +// let ln_f_g = tl.load("transformer.ln_f.weight")?.transfer_to(backend); +// let ln_f_b = tl.load("transformer.ln_f.bias")?.transfer_to(backend); +// let lmh_g = tl.load("lm_head.weight")?.transfer_to(backend); +// let lmh_b = tl.load("lm_head.bias")?.transfer_to(backend); + +// let mut layers = Vec::new(); +// for i in 0..hyperparameters.n_layer { +// let backend = params.backend(i); + +// let layer = Layer { +// ln_1_g: tl +// .load(&format!("transformer.h.{i}.ln_1.weight"))? +// .transfer_to(backend), +// ln_1_b: tl +// .load(&format!("transformer.h.{i}.ln_1.bias"))? 
+// .transfer_to(backend), +// c_attn_q_proj_w: tl +// .load(&format!("transformer.h.{i}.attn.q_proj.weight"))? +// .transfer_to(backend), +// c_attn_k_proj_w: tl +// .load(&format!("transformer.h.{i}.attn.k_proj.weight"))? +// .transfer_to(backend), +// c_attn_v_proj_w: tl +// .load(&format!("transformer.h.{i}.attn.v_proj.weight"))? +// .transfer_to(backend), +// c_attn_proj_w: tl +// .load(&format!("transformer.h.{i}.attn.out_proj.weight"))? +// .transfer_to(backend), +// c_mlp_fc_w: tl +// .load(&format!("transformer.h.{i}.mlp.fc_in.weight"))? +// .transfer_to(backend), +// c_mlp_fc_b: tl +// .load(&format!("transformer.h.{i}.mlp.fc_in.bias"))? +// .transfer_to(backend), +// c_mlp_proj_w: tl +// .load(&format!("transformer.h.{i}.mlp.fc_out.weight"))? +// .transfer_to(backend), +// c_mlp_proj_b: tl +// .load(&format!("transformer.h.{i}.mlp.fc_out.bias"))? +// .transfer_to(backend), +// }; + +// layers.push(layer); +// } + +// let context = tl.finish(); + +// Ok(GptJ { +// hyperparameters, +// params, +// tokenizer, +// ln_f_g, +// ln_f_b, +// wte, +// lmh_g, +// lmh_b, +// layers, +// context, +// }) +// } + +// fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { +// InferenceSession::new( +// config, +// &self.params, +// self.hyperparameters.n_layer, +// self.hyperparameters.n_embd, +// self.hyperparameters.n_vocab, +// ) +// } + +// fn evaluate( +// &self, +// session: &mut InferenceSession, +// input_tokens: &[TokenId], +// output_request: &mut OutputRequest, +// ) { +// let ctx_size = self.params.context_size; + +// let Hyperparameters { +// n_embd, +// n_head, +// n_vocab, +// n_layer, +// n_rot, +// .. +// } = self.hyperparameters; + +// let outputs = session.compute(self.context.clone(), input_tokens, |builder| { +// let input_len = builder.input_length(); +// let session_len = builder.n_past; + +// let mut ctx0 = builder.ctx0.borrow_mut(); +// let (memory_k_size, memory_v_size) = ( +// builder.memory_k.element_size(), +// builder.memory_v.element_size(), +// ); +// let embd = builder.embd; + +// let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + +// let mut gf = ctx0.create_compute_graph(); +// for il in 0..n_layer { +// ctx0.set_offloading(self.params.should_offload(il)); + +// // norm +// let mut current = ctx0.op_norm(&input_layer); +// current = ctx0.op_add( +// &ctx0.op_mul(¤t, &self.layers[il].ln_1_g), +// &self.layers[il].ln_1_b, +// ); + +// let input_sa = current.share(); + +// // self-attention +// let overrides = self.params.rope_overrides.as_ref(); +// let qcur = ctx0.op_rope_inplace( +// &ctx0.op_reshape_3d( +// &ctx0.op_mul_mat(&self.layers[il].c_attn_q_proj_w, ¤t), +// n_embd / n_head, +// n_head, +// input_len, +// ), +// session_len, +// n_rot, +// 0, +// overrides, +// ); +// let kcur = ctx0.op_rope_inplace( +// &ctx0.op_reshape_3d( +// &ctx0.op_mul_mat(&self.layers[il].c_attn_k_proj_w, ¤t), +// n_embd / n_head, +// n_head, +// input_len, +// ), +// session_len, +// n_rot, +// 0, +// overrides, +// ); + +// // self-attention store key and value to memory +// let vcur = +// ctx0.op_transpose(&ctx0.op_mul_mat(&self.layers[il].c_attn_v_proj_w, ¤t)); + +// let k = ctx0.op_view_1d( +// builder.memory_k, +// input_len * n_embd, +// (memory_k_size * n_embd) * (il * ctx_size + session_len), +// ); +// let v = ctx0.op_view_2d( +// builder.memory_v, +// (input_len, n_embd), +// ctx_size * memory_v_size, +// (il * ctx_size) * memory_v_size * n_embd + session_len * memory_v_size, +// ); + +// gf.build_forward_expand(&ctx0.op_cpy(&kcur, 
&k)); +// gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + +// let q = ctx0.op_permute(&qcur, (0, 2, 1, 3)); +// let big_k = ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// builder.memory_k, +// (session_len + input_len) * n_embd, +// il * ctx_size * memory_k_size * n_embd, +// ), +// n_embd / n_head, +// n_head, +// session_len + input_len, +// ), +// (0, 2, 1, 3), +// ); + +// let kq = ctx0.op_mul_mat(&big_k, &q); +// let kq_scaled = ctx0.op_scale_inplace( +// &kq, +// &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), +// ); + +// let kq_masked = ctx0.op_diag_mask_inf_inplace(&kq_scaled, session_len); +// let kq_softmax = ctx0.op_soft_max_inplace(&kq_masked); + +// let big_v = ctx0.op_view_3d( +// builder.memory_v, +// (session_len + input_len, n_embd / n_head, n_head), +// ( +// ctx_size * memory_v_size, +// ctx_size * memory_v_size * n_embd / n_head, +// ), +// il * ctx_size * memory_v_size * n_embd, +// ); + +// let kqv = ctx0.op_mul_mat(&big_v, &kq_softmax); +// let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + +// current = ctx0.op_cpy( +// &kqv_merged, +// &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), +// ); + +// // self-attention projection +// current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); + +// // feed-forward +// let ff_in = current.share(); + +// current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, &input_sa); +// current = ctx0.op_add(¤t, &self.layers[il].c_mlp_fc_b); + +// current = ctx0.op_gelu(¤t); + +// // feed-forward projection +// current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); +// current = ctx0.op_add(¤t, &self.layers[il].c_mlp_proj_b); + +// current = ctx0.op_add(¤t, &ff_in); + +// // input for next layer +// input_layer = ctx0.op_add(¤t, &input_layer); +// } + +// // norm +// input_layer = ctx0.op_norm(&input_layer); +// input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_f_g), &self.ln_f_b); + +// let embeddings_tensor: ggml::Tensor = input_layer.share(); + +// // lm_head +// input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer); + +// ctx0.set_offloading(false); + +// input_layer = ctx0.op_add(&input_layer, &self.lmh_b); + +// ( +// gf, +// GraphOutputs { +// result: input_layer, +// embedding_result: embeddings_tensor, +// output_length: input_len, +// }, +// ) +// }); + +// // finish evaluation +// common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); +// common::extract_logits( +// output_request, +// &outputs.result, +// n_vocab, +// outputs.output_length, +// ); +// common::extract_embeddings( +// output_request, +// &outputs.embedding_result, +// n_embd, +// outputs.output_length, +// ); +// } + +// fn hyperparameters(&self) -> &Self::Hyperparameters { +// &self.hyperparameters +// } + +// fn tokenizer(&self) -> &Tokenizer { +// &self.tokenizer +// } + +// fn context_size(&self) -> usize { +// self.params.context_size +// } + +// fn bot_token_id(&self) -> Option { +// None +// } + +// fn eot_token_id(&self) -> TokenId { +// self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() +// } + +// fn quantize_tensors() -> Vec { +// vec![Regex::new(".*weight").unwrap()] +// } + +// fn skip_quantize_tensors() -> Vec { +// vec![] +// } + +// fn supports_rewind(&self) -> bool { +// true +// } +// } + +// /// GPT-J [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +// #[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] +// pub struct Hyperparameters { +// /// Size of the model's vocabulary +// pub n_vocab: 
usize, +// /// Size of the model's context +// pub n_ctx: usize, +// /// Size of the model's embedding layer +// pub n_embd: usize, +// /// n_head +// pub n_head: usize, +// /// Number of layers in the model +// pub n_layer: usize, +// /// n_rot +// pub n_rot: usize, +// /// file_type +// pub file_type: FileType, +// } + +// impl llm_base::Hyperparameters for Hyperparameters { +// fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { +// let hyperparameters = Hyperparameters { +// n_vocab: util::read_i32(reader)?.try_into()?, +// n_ctx: util::read_i32(reader)?.try_into()?, +// n_embd: util::read_i32(reader)?.try_into()?, +// n_head: util::read_i32(reader)?.try_into()?, +// n_layer: util::read_i32(reader)?.try_into()?, +// n_rot: util::read_i32(reader)?.try_into()?, +// file_type: util::read_filetype(reader)?, +// }; + +// let n_vocab = util::read_i32(reader)? as usize; +// if hyperparameters.n_vocab != n_vocab { +// return Err(LoadError::InvariantBroken { +// path: None, +// invariant: format!( +// "GPTJ model expected n_vocab {} found {}", +// hyperparameters.n_vocab, n_vocab +// ), +// }); +// } + +// Ok(hyperparameters) +// } + +// fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { +// util::write_i32(writer, self.n_vocab.try_into()?)?; +// util::write_i32(writer, self.n_ctx.try_into()?)?; +// util::write_i32(writer, self.n_embd.try_into()?)?; +// util::write_i32(writer, self.n_head.try_into()?)?; +// util::write_i32(writer, self.n_layer.try_into()?)?; +// util::write_i32(writer, self.n_rot.try_into()?)?; +// util::write_i32(writer, self.file_type.into())?; +// util::write_i32(writer, self.n_vocab.try_into()?)?; +// Ok(()) +// } + +// fn n_vocabulary(&self) -> usize { +// self.n_vocab +// } + +// fn file_type(&self) -> Option { +// Some(self.file_type) +// } + +// fn file_type_mut(&mut self) -> Option<&mut FileType> { +// Some(&mut self.file_type) +// } +// } + +// struct Layer { +// // normalization +// ln_1_g: Tensor, +// ln_1_b: Tensor, + +// // attention +// c_attn_q_proj_w: Tensor, +// c_attn_k_proj_w: Tensor, +// c_attn_v_proj_w: Tensor, + +// c_attn_proj_w: Tensor, + +// // ff +// c_mlp_fc_w: Tensor, +// c_mlp_fc_b: Tensor, + +// c_mlp_proj_w: Tensor, +// c_mlp_proj_b: Tensor, +// } diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index e355fe22..6b7b0086 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -2,14 +2,15 @@ //! This crate also supports the [RedPajama](https://www.together.xyz/blog/redpajama) GPT-NeoX model. #![deny(missing_docs)] -use std::error::Error; - use ggml::Tensor; use llm_base::{ - ggml, - model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, - ModelContext, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, + ggml::{ + self, + format::gguf::{Metadata, MetadataValue, META_TENSOR_DATA_LAYOUT}, + }, + model::{common, HyperparametersReadError, ModelData, ModelLoadArgs, ModelLoadError}, + FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, Model, ModelContext, + OutputRequest, Regex, TokenId, }; /// The GPT-NeoX model. Ref: [GitHub](https://github.com/EleutherAI/gpt-neox) @@ -17,10 +18,8 @@ use llm_base::{ /// # Safety /// This implements [Send] and [Sync] as it is immutable after construction. 
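Note on the changed imports above: they carry the substance of this migration. Hyperparameters are no longer read positionally with util::read_i32 from a GGML header; they are looked up by key in the GGUF Metadata (gptneox.embedding_length, gptneox.block_count, and so on, as Hyperparameters::read does further down in this file). A minimal stand-in for that keyed access pattern, using a plain HashMap rather than the real Metadata type, purely to illustrate the shape of the lookups; the key names and values below are illustrative, not read from any file:

use std::collections::HashMap;

// Simplified stand-in for the GGUF metadata table; the real `Metadata` type lives in
// llm_base::ggml::format::gguf and carries typed values, not bare u64s.
fn get_countable(metadata: &HashMap<String, u64>, key: &str) -> Result<usize, String> {
    metadata
        .get(key)
        .map(|v| *v as usize)
        .ok_or_else(|| format!("missing metadata key: {key}"))
}

fn main() {
    let mut metadata = HashMap::new();
    // Illustrative values (roughly GPT-NeoX-20B sized).
    metadata.insert("gptneox.block_count".to_string(), 44u64);
    metadata.insert("gptneox.embedding_length".to_string(), 6144u64);

    // Keyed, self-describing lookups replace the old fixed-order i32 reads.
    let block_count = get_countable(&metadata, "gptneox.block_count").unwrap();
    let embedding_length = get_countable(&metadata, "gptneox.embedding_length").unwrap();
    println!("{block_count} blocks, {embedding_length}-wide embeddings");
}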
pub struct GptNeoX { - params: ModelParameters, - + data: ModelData, hyperparameters: Hyperparameters, - tokenizer: Tokenizer, // model-global weights // normalization gain & bias @@ -32,7 +31,7 @@ pub struct GptNeoX { lmh_g: Tensor, // weights for the model - layers: Vec, + blocks: Vec, // must be kept alive for the model context: ModelContext, @@ -41,102 +40,82 @@ pub struct GptNeoX { unsafe impl Send for GptNeoX {} unsafe impl Sync for GptNeoX {} -impl KnownModel for GptNeoX { - type Hyperparameters = Hyperparameters; +impl Model for GptNeoX { + fn new(args: ModelLoadArgs) -> Result { + let hyperparameters = Hyperparameters::read(&args.gguf.metadata)?; - fn new( - hyperparameters: Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl TensorLoader, - ) -> Result - where - Self: Sized, - { - let mut tl = tensor_loader; + let mut tl = args.tensor_loader; // model-global weights - let wte = tl.load("gpt_neox.embed_in.weight")?; - - let backend = params.backend(0); - - let ln_f_g = tl - .load("gpt_neox.final_layer_norm.weight")? - .transfer_to(backend); - let ln_f_b = tl - .load("gpt_neox.final_layer_norm.bias")? - .transfer_to(backend); - let lmh_g = tl.load("embed_out.weight")?.transfer_to(backend); - - let mut layers = Vec::new(); - for i in 0..hyperparameters.n_layer { - let backend = params.backend(i); - let layer = Layer { + let wte = tl.load("token_embd.weight")?; + + let data = args.data; + let backend = data.params.backend(0); + + let ln_f_g = tl.load("output_norm.weight")?.transfer_to(backend); + let ln_f_b = tl.load("output_norm.bias")?.transfer_to(backend); + let lmh_g = tl.load("output.weight")?.transfer_to(backend); + + let mut blocks = Vec::new(); + for i in 0..hyperparameters.block_count { + let backend = data.params.backend(i); + let block = Block { ln_1_g: tl - .load(&format!("gpt_neox.layers.{i}.input_layernorm.weight"))? + .load(&format!("blk.{i}.attn_norm.weight"))? .transfer_to(backend), ln_1_b: tl - .load(&format!("gpt_neox.layers.{i}.input_layernorm.bias"))? + .load(&format!("blk.{i}.attn_norm.bias"))? .transfer_to(backend), c_attn_attn_w: tl - .load(&format!( - "gpt_neox.layers.{i}.attention.query_key_value.weight" - ))? + .load(&format!("blk.{i}.attn_qkv.weight"))? .transfer_to(backend), c_attn_attn_b: tl - .load(&format!( - "gpt_neox.layers.{i}.attention.query_key_value.bias" - ))? + .load(&format!("blk.{i}.attn_qkv.bias"))? .transfer_to(backend), c_attn_proj_w: tl - .load(&format!("gpt_neox.layers.{i}.attention.dense.weight"))? + .load(&format!("blk.{i}.attn_output.weight"))? .transfer_to(backend), c_attn_proj_b: tl - .load(&format!("gpt_neox.layers.{i}.attention.dense.bias"))? + .load(&format!("blk.{i}.attn_output.bias"))? .transfer_to(backend), ln_2_g: tl - .load(&format!( - "gpt_neox.layers.{i}.post_attention_layernorm.weight" - ))? + .load(&format!("blk.{i}.ffn_norm.weight"))? .transfer_to(backend), ln_2_b: tl - .load(&format!( - "gpt_neox.layers.{i}.post_attention_layernorm.bias" - ))? + .load(&format!("blk.{i}.ffn_norm.bias"))? .transfer_to(backend), c_mlp_fc_w: tl - .load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.weight"))? + .load(&format!("blk.{i}.ffn_up.weight"))? .transfer_to(backend), c_mlp_fc_b: tl - .load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.bias"))? + .load(&format!("blk.{i}.ffn_up.bias"))? .transfer_to(backend), c_mlp_proj_w: tl - .load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.weight"))? + .load(&format!("blk.{i}.ffn_down.weight"))? 
.transfer_to(backend), c_mlp_proj_b: tl - .load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.bias"))? + .load(&format!("blk.{i}.ffn_down.bias"))? .transfer_to(backend), }; - layers.push(layer); + blocks.push(block); } let context = tl.finish(); Ok(GptNeoX { + data, hyperparameters, - params, - tokenizer, ln_f_g, ln_f_b, wte, lmh_g, - layers, + blocks, context, }) } @@ -144,10 +123,10 @@ impl KnownModel for GptNeoX { fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { InferenceSession::new( config, - &self.params, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, + &self.data.params, + self.hyperparameters.block_count, + self.hyperparameters.embedding_length, + self.tokenizer().len(), ) } @@ -159,25 +138,29 @@ impl KnownModel for GptNeoX { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let n_ctx = self.params.context_size; + let params = &self.data.params; + let ctx_size = params.context_size; + + let vocabulary_count = self.tokenizer().len(); let Hyperparameters { - n_embd, - n_head, - n_vocab, - n_layer, - n_rot, + embedding_length, + head_count, + block_count, use_parallel_residual, + rope_dimension_count, .. } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let n = builder.input_length(); - let n_past = builder.n_past; + let input_len = builder.input_length(); + let session_len = builder.n_past; let mut ctx0 = builder.ctx0.borrow_mut(); let embd = builder.embd; + let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), @@ -185,61 +168,66 @@ impl KnownModel for GptNeoX { let mut gf = ctx0.create_compute_graph(); - for il in 0..n_layer { - ctx0.set_offloading(self.params.should_offload(il)); + for il in 0..block_count { + ctx0.set_offloading(params.should_offload(il)); // self-attention let mut current = ctx0.op_norm(&input_layer); current = ctx0.op_add( - &ctx0.op_mul(¤t, &self.layers[il].ln_1_g), - &self.layers[il].ln_1_b, + &ctx0.op_mul(&ctx0.op_repeat(&self.blocks[il].ln_1_g, ¤t), ¤t), + &ctx0.op_repeat(&self.blocks[il].ln_1_b, ¤t), ); // self-attention compute QKV - current = ctx0.op_mul_mat(&self.layers[il].c_attn_attn_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].c_attn_attn_b); + current = ctx0.op_mul_mat(&self.blocks[il].c_attn_attn_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.blocks[il].c_attn_attn_b, ¤t), + ¤t, + ); let nb = current.get_nb()[1]; let f32_size = std::mem::size_of::(); + let n_embd_head = embedding_length / head_count; let mut qcur = ctx0.op_cont(&ctx0.op_view_3d( ¤t, - (n_embd / n_head, n_head, n), - (nb / n_head, nb), + (n_embd_head, head_count, input_len), + (nb / head_count, nb), 0, )); let mut kcur = ctx0.op_cont(&ctx0.op_view_3d( ¤t, - (n_embd / n_head, n_head, n), - (nb / n_head, nb), - f32_size * n_embd / n_head, + (n_embd_head, head_count, input_len), + (nb / head_count, nb), + f32_size * n_embd_head, )); let mut vcur = ctx0.op_cont(&ctx0.op_view_3d( ¤t, - (n_embd / n_head, n_head, n), - (nb / n_head, nb), - 2 * f32_size * n_embd / n_head, + (n_embd_head, head_count, input_len), + (nb / head_count, nb), + 2 * f32_size * n_embd_head, )); // self-attention using mode = 2 for GPT-NeoX mode - let overrides = self.params.rope_overrides.as_ref(); - qcur = ctx0.op_rope_inplace(&qcur, n_past, n_rot, 2, overrides); - kcur = ctx0.op_rope_inplace(&kcur, n_past, n_rot, 2, overrides); + let 
overrides = params.rope_overrides.as_ref(); + qcur = ctx0.op_rope_inplace(&qcur, session_len, rope_dimension_count, 2, overrides); + kcur = ctx0.op_rope_inplace(&kcur, session_len, rope_dimension_count, 2, overrides); // store key and value to memory - vcur = ctx0.op_transpose(&ctx0.op_reshape_2d(&vcur, n_embd, n)); + vcur = ctx0.op_transpose(&ctx0.op_reshape_2d(&vcur, embedding_length, input_len)); let k = ctx0.op_view_1d( builder.memory_k, - n * n_embd, - (memory_k_size * n_embd) * (il * n_ctx + n_past), + input_len * embedding_length, + (memory_k_size * embedding_length) * (il * ctx_size + session_len), ); let v = ctx0.op_view_2d( builder.memory_v, - (n, n_embd), - n_ctx * memory_v_size, - (il * n_ctx) * memory_v_size * n_embd + n_past * memory_v_size, + (input_len, embedding_length), + ctx_size * memory_v_size, + (il * ctx_size) * memory_v_size * embedding_length + + session_len * memory_v_size, ); gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); @@ -252,12 +240,12 @@ impl KnownModel for GptNeoX { &ctx0.op_reshape_3d( &ctx0.op_view_1d( builder.memory_k, - (n_past + n) * n_embd, - il * n_ctx * memory_k_size * n_embd, + (session_len + input_len) * embedding_length, + il * ctx_size * memory_k_size * embedding_length, ), - n_embd / n_head, - n_head, - n_past + n, + n_embd_head, + head_count, + session_len + input_len, ), (0, 2, 1, 3), ); @@ -268,11 +256,11 @@ impl KnownModel for GptNeoX { // KQ_scaled = KQ / sqrt(n_embd/n_head) let KQ_scaled = ctx0.op_scale_inplace( &KQ, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + &ctx0.new_f32(1f32 / f32::sqrt(embedding_length as f32 / head_count as f32)), ); // KQ_masked = mask_past(KQ_scaled) - let KQ_masked = ctx0.op_diag_mask_inf_inplace(&KQ_scaled, n_past); + let KQ_masked = ctx0.op_diag_mask_inf_inplace(&KQ_scaled, session_len); // KQ = soft_max(KQ_masked) let KQ_softmax = ctx0.op_soft_max_inplace(&KQ_masked); @@ -280,12 +268,12 @@ impl KnownModel for GptNeoX { // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() let V = ctx0.op_view_3d( builder.memory_v, - (n_past + n, n_embd / n_head, n_head), + (session_len + input_len, n_embd_head, head_count), ( - n_ctx * memory_v_size, - n_ctx * memory_v_size * n_embd / n_head, + ctx_size * memory_v_size, + ctx_size * memory_v_size * n_embd_head, ), - il * n_ctx * memory_v_size * n_embd, + il * ctx_size * memory_v_size * embedding_length, ); // KQV = transpose(V) * KQ_soft_max @@ -294,16 +282,22 @@ impl KnownModel for GptNeoX { let KQV_merged = ctx0.op_permute(&KQV, (0, 2, 1, 3)); // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy(&KQV_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); + current = ctx0.op_cpy( + &KQV_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, embedding_length, input_len), + ); // self-attention projection - current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); - current = ctx0.op_add(¤t, &self.layers[il].c_attn_proj_b); + current = ctx0.op_mul_mat(&self.blocks[il].c_attn_proj_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.blocks[il].c_attn_proj_b, ¤t), + ¤t, + ); let feedforward_input: Tensor; if !use_parallel_residual { feedforward_input = ctx0.op_add(¤t, &input_layer); - current = feed_forward_network(&ctx0, &self.layers[il], &feedforward_input); + current = feed_forward_network(&ctx0, &self.blocks[il], &feedforward_input); // input for next layer input_layer = ctx0.op_add(¤t, &feedforward_input); } else { @@ -312,7 +306,7 @@ impl KnownModel for GptNeoX { // this is independent 
of the self-attention result, so it could be done in parallel to the self-attention // note here we pass inpL instead of cur - current = feed_forward_network(&ctx0, &self.layers[il], &input_layer); + current = feed_forward_network(&ctx0, &self.blocks[il], &input_layer); // layer input + FF current = ctx0.op_add(¤t, &feedforward_input); @@ -325,7 +319,10 @@ impl KnownModel for GptNeoX { // normalize the output input_layer = ctx0.op_norm(&input_layer); // inpL = ln_f_g*inpL + ln_f_b - input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_f_g), &self.ln_f_b); + input_layer = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.ln_f_g, &input_layer), &input_layer), + &ctx0.op_repeat(&self.ln_f_b, &input_layer), + ); let embeddings_tensor: ggml::Tensor = input_layer.share(); @@ -338,37 +335,34 @@ impl KnownModel for GptNeoX { GraphOutputs { result: input_layer, embedding_result: embeddings_tensor, - output_length: n, + output_length: input_len, }, ) }); // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::read_last_token( + session, + &outputs.result, + vocabulary_count, + outputs.output_length, + ); common::extract_logits( output_request, &outputs.result, - n_vocab, + vocabulary_count, outputs.output_length, ); common::extract_embeddings( output_request, &outputs.embedding_result, - n_embd, + embedding_length, outputs.output_length, ); } - fn hyperparameters(&self) -> &Self::Hyperparameters { - &self.hyperparameters - } - - fn tokenizer(&self) -> &Tokenizer { - &self.tokenizer - } - - fn context_size(&self) -> usize { - self.params.context_size + fn data(&self) -> &ModelData { + &self.data } fn bot_token_id(&self) -> Option { @@ -376,14 +370,14 @@ impl KnownModel for GptNeoX { } fn eot_token_id(&self) -> TokenId { - self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer().id("<|endoftext|>".as_bytes()).unwrap() } - fn quantize_tensors() -> Vec { + fn quantize_tensors(&self) -> Vec { vec![Regex::new(".*weight").unwrap()] } - fn skip_quantize_tensors() -> Vec { + fn skip_quantize_tensors(&self) -> Vec { vec![] } @@ -393,82 +387,44 @@ impl KnownModel for GptNeoX { } /// GPT-NeoX [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct Hyperparameters { - /// Size of the model's vocabulary - pub n_vocab: usize, - /// Size of the model's context - pub n_ctx: usize, /// Size of the model's embedding layer - pub n_embd: usize, + embedding_length: usize, /// n_head - pub n_head: usize, - /// Number of layers in the model - pub n_layer: usize, - /// n_rot - pub n_rot: usize, + head_count: usize, + /// Number of blocks in the model + block_count: usize, /// Whether to use a "parallel" formulation in each Transformer layer. /// This is on for most models, but is off for some e.g. RedPajama. 
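As a reading aid for the branch above: with scalar stand-ins for tensors (f64 instead of ggml tensors, and placeholder closures for the normalized attention and feed-forward sub-graphs, not real APIs), the two formulations selected by use_parallel_residual reduce to roughly the following sketch.

// Schematic only: f64 stands in for a tensor; `attn` and `ffn` stand in for the
// norm + self-attention and norm + feed-forward sub-graphs built in `evaluate` above.
fn serial_residual(x: f64, attn: impl Fn(f64) -> f64, ffn: impl Fn(f64) -> f64) -> f64 {
    // use_parallel_residual == false: the feed-forward reads (attention output + input).
    let after_attn = x + attn(x);
    after_attn + ffn(after_attn)
}

fn parallel_residual(x: f64, attn: impl Fn(f64) -> f64, ffn: impl Fn(f64) -> f64) -> f64 {
    // use_parallel_residual == true (most GPT-NeoX checkpoints): the feed-forward reads
    // the layer input, so it is independent of the attention result.
    x + attn(x) + ffn(x)
}

fn main() {
    let attn = |x: f64| 0.5 * x; // placeholder for the attention sub-graph
    let ffn = |x: f64| 0.25 * x; // placeholder for the feed-forward sub-graph
    println!("serial:   {}", serial_residual(1.0, attn, ffn));
    println!("parallel: {}", parallel_residual(1.0, attn, ffn));
}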
- pub use_parallel_residual: bool, + use_parallel_residual: bool, + // RoPE dimension count + rope_dimension_count: usize, /// file_type - pub file_type: FileType, -} - -impl Default for Hyperparameters { - fn default() -> Self { - Self { - n_vocab: Default::default(), - n_ctx: Default::default(), - n_embd: Default::default(), - n_head: Default::default(), - n_layer: Default::default(), - n_rot: Default::default(), - file_type: Default::default(), - use_parallel_residual: true, - } - } + file_type: Option, + /// The tensor data layout that this model was encoded with + tensor_data_layout: String, } -impl llm_base::Hyperparameters for Hyperparameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - Ok(Hyperparameters { - n_vocab: util::read_i32(reader)?.try_into()?, - n_ctx: util::read_i32(reader)?.try_into()?, - n_embd: util::read_i32(reader)?.try_into()?, - n_head: util::read_i32(reader)?.try_into()?, - n_layer: util::read_i32(reader)?.try_into()?, - n_rot: util::read_i32(reader)?.try_into()?, - use_parallel_residual: util::read_bool(reader)?, - file_type: util::read_filetype(reader)?, +impl Hyperparameters { + fn read(metadata: &Metadata) -> Result { + Ok(Self { + embedding_length: metadata.get_countable("gptneox.embedding_length")?, + head_count: metadata.get_countable("gptneox.attention.head_count")?, + block_count: metadata.get_countable("gptneox.block_count")?, + use_parallel_residual: metadata + .get_with_type("gptneox.use_parallel_residual", MetadataValue::as_bool)?, + rope_dimension_count: metadata.get_countable("gptneox.rope.dimension_count")?, + file_type: FileType::read_for_hyperparameters(metadata)?, + tensor_data_layout: metadata + .get_str("llama.tensor_data_layout") + .unwrap_or(META_TENSOR_DATA_LAYOUT) + .to_string(), }) } - - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.n_vocab.try_into()?)?; - util::write_i32(writer, self.n_ctx.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_head.try_into()?)?; - util::write_i32(writer, self.n_layer.try_into()?)?; - util::write_i32(writer, self.n_rot.try_into()?)?; - util::write_bool(writer, self.use_parallel_residual)?; - util::write_i32(writer, self.file_type.into())?; - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } - - fn file_type(&self) -> Option { - Some(self.file_type) - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - Some(&mut self.file_type) - } } -struct Layer { +struct Block { // pre-normalization ln_1_g: Tensor, ln_1_b: Tensor, @@ -492,26 +448,29 @@ struct Layer { c_mlp_proj_b: Tensor, } -fn feed_forward_network(context: &ggml::Context, layer: &Layer, input: &Tensor) -> Tensor { +fn feed_forward_network(context: &ggml::Context, block: &Block, input: &Tensor) -> Tensor { let mut current = context.op_norm(input); - //gain and bias - current = context.op_add(&context.op_mul(¤t, &layer.ln_2_g), &layer.ln_2_b); + // gain and bias + current = context.op_add( + &context.op_mul(&context.op_repeat(&block.ln_2_g, ¤t), ¤t), + &context.op_repeat(&block.ln_2_b, ¤t), + ); // apply weights - current = context.op_mul_mat(&layer.c_mlp_fc_w, ¤t); + current = context.op_mul_mat(&block.c_mlp_fc_w, ¤t); // apply bias - current = context.op_add(¤t, &layer.c_mlp_fc_b); + current = context.op_add(&context.op_repeat(&block.c_mlp_fc_b, ¤t), ¤t); // GELU activation current = context.op_gelu(¤t); // projection // cur = proj_w*cur + proj_b - current = 
context.op_mul_mat(&layer.c_mlp_proj_w, ¤t); + current = context.op_mul_mat(&block.c_mlp_proj_w, ¤t); - current = context.op_add(¤t, &layer.c_mlp_proj_b); + current = context.op_add(&context.op_repeat(&block.c_mlp_proj_b, ¤t), ¤t); current } diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 69ab5aa8..b282ecdb 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -1,13 +1,14 @@ //! An implementation of [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) for the `llm` ecosystem. #![deny(missing_docs)] -use std::error::Error; - use llm_base::{ - ggml::{self}, - model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, - ModelContext, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, + ggml::{ + self, + format::gguf::{Metadata, META_TENSOR_DATA_LAYOUT}, + }, + model::{common, HyperparametersReadError, ModelData, ModelLoadArgs, ModelLoadError}, + FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, Model, ModelContext, + OutputRequest, Regex, TokenId, }; /// The LLaMA model. Ref: [Introducing LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) @@ -15,10 +16,8 @@ use llm_base::{ /// # Safety /// This implements [Send] and [Sync] as it is immutable after construction. pub struct Llama { - params: ModelParameters, + data: ModelData, hyperparameters: Hyperparameters, - tokenizer: Tokenizer, - _version: LlamaModelType, // model-global weights // weighted token embeddings wte: ggml::Tensor, @@ -28,7 +27,7 @@ pub struct Llama { output: ggml::Tensor, // weights for the model - layers: Vec, + blocks: Vec, // must be kept alive for the model context: ModelContext, @@ -37,94 +36,68 @@ pub struct Llama { unsafe impl Send for Llama {} unsafe impl Sync for Llama {} -impl KnownModel for Llama { - type Hyperparameters = Hyperparameters; +impl Model for Llama { + fn new(args: ModelLoadArgs) -> Result { + let hyperparameters = Hyperparameters::read(&args.gguf.metadata)?; + assert_eq!(hyperparameters.tensor_data_layout, META_TENSOR_DATA_LAYOUT); - fn new( - mut hyperparameters: Self::Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl TensorLoader, - ) -> Result { - let mut tl = tensor_loader; + let mut tl = args.tensor_loader; // model-global weights - let wte = tl.load("tok_embeddings.weight")?; + let wte = tl.load("token_embd.weight")?; + + let data = args.data; - let backend = params.backend(0); + let backend = data.params.backend(0); - let norm = tl.load("norm.weight")?.transfer_to(backend); + let norm = tl.load("output_norm.weight")?.transfer_to(backend); let output = tl.load("output.weight")?.transfer_to(backend); - let mut layers = Vec::new(); + let mut blocks = Vec::new(); - for i in 0..hyperparameters.n_layer { - let backend = params.backend(i); + for i in 0..hyperparameters.block_count { + let backend = data.params.backend(i); - let layer = Layer { - attention_norm: tl - .load(&format!("layers.{i}.attention_norm.weight"))? + let block = Block { + attn_n: tl + .load(&format!("blk.{i}.attn_norm.weight"))? .transfer_to(backend), - wq: tl - .load(&format!("layers.{i}.attention.wq.weight"))? + attn_q: tl + .load(&format!("blk.{i}.attn_q.weight"))? .transfer_to(backend), - wk: tl - .load(&format!("layers.{i}.attention.wk.weight"))? + attn_k: tl + .load(&format!("blk.{i}.attn_k.weight"))? 
.transfer_to(backend), - wv: tl - .load(&format!("layers.{i}.attention.wv.weight"))? + attn_v: tl + .load(&format!("blk.{i}.attn_v.weight"))? .transfer_to(backend), - wo: tl - .load(&format!("layers.{i}.attention.wo.weight"))? + attn_output: tl + .load(&format!("blk.{i}.attn_output.weight"))? .transfer_to(backend), ffn_norm: tl - .load(&format!("layers.{i}.ffn_norm.weight"))? + .load(&format!("blk.{i}.ffn_norm.weight"))? .transfer_to(backend), - w1: tl - .load(&format!("layers.{i}.feed_forward.w1.weight"))? + ffn_gate: tl + .load(&format!("blk.{i}.ffn_gate.weight"))? .transfer_to(backend), - w2: tl - .load(&format!("layers.{i}.feed_forward.w2.weight"))? + ffn_down: tl + .load(&format!("blk.{i}.ffn_down.weight"))? .transfer_to(backend), - w3: tl - .load(&format!("layers.{i}.feed_forward.w3.weight"))? + ffn_up: tl + .load(&format!("blk.{i}.ffn_up.weight"))? .transfer_to(backend), }; - layers.push(layer); + blocks.push(block); } let context = tl.finish(); - // TODO: read from file - let mut version = match hyperparameters.n_layer { - 26 => LlamaModelType::Model3b, - 32 => LlamaModelType::Model7b, - 40 => LlamaModelType::Model13b, - 60 => LlamaModelType::Model30b, - 80 => LlamaModelType::Model65b, - _ => LlamaModelType::Model7b, // anything < 32 - }; - // TODO: temporary fix for 70B models - if let Some(n_gqa) = params.n_gqa { - if hyperparameters.n_layer >= 80 { - assert_eq!( - hyperparameters.n_head % n_gqa, - 0, - "assuming 70B Llama2 model based on GQA == 8" - ); - hyperparameters.n_head_kv = hyperparameters.n_head / n_gqa; - version = LlamaModelType::Model70b; - } - } - Ok(Self { + data, hyperparameters, - params, - _version: version, - tokenizer, wte, norm, output, - layers, + blocks, context, }) } @@ -133,10 +106,10 @@ impl KnownModel for Llama { fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { InferenceSession::new( config, - &self.params, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, + &self.data.params, + self.hyperparameters.block_count, + self.hyperparameters.embedding_length, + self.tokenizer().len(), ) } @@ -147,34 +120,35 @@ impl KnownModel for Llama { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let ctx_size = self.params.context_size; + let params = &self.data.params; + let ctx_size = params.context_size; + + let vocabulary_count = self.tokenizer().len(); let Hyperparameters { - n_vocab, - n_embd, - n_mult: _, - n_head, - n_head_kv, - n_layer, - n_rot, + embedding_length, + head_count, + head_count_kv, + block_count, file_type: _, + tensor_data_layout: _, } = self.hyperparameters; - let n_embd_gqa = n_embd / (n_head / n_head_kv); + + let embedding_length_gqa = + embedding_length / self.hyperparameters.grouped_query_attention(); let outputs = session.compute(self.context.clone(), input_tokens, |builder| { let session_len = builder.n_past; let input_len = builder.input_length(); - let mut ctx0 = builder.ctx0.borrow_mut(); - let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.wte, embd); let mut gf = ctx0.create_compute_graph(); - for il in 0..n_layer { - ctx0.set_offloading(self.params.should_offload(il)); + for il in 0..block_count { + ctx0.set_offloading(params.should_offload(il)); let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; @@ -183,21 +157,22 @@ impl KnownModel for Llama { current = ctx0.op_rms_norm(&input_layer); // cur = attention_norm * cur - current = ctx0.op_mul(¤t, &self.layers[il].attention_norm); + current = 
ctx0.op_mul(¤t, &self.blocks[il].attn_n); // self-attention // compute Q and K and RoPE them - let overrides = self.params.rope_overrides.as_ref(); + let overrides = params.rope_overrides.as_ref(); + let n_embd_head = embedding_length / head_count; let q_current = ctx0 .op_rope_inplace( &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].wq, ¤t), - n_embd / n_head, - n_head, + &ctx0.op_mul_mat(&self.blocks[il].attn_q, ¤t), + n_embd_head, + head_count, input_len, ), session_len, - n_rot, + n_embd_head, 0, overrides, ) @@ -205,37 +180,38 @@ impl KnownModel for Llama { let k_current = ctx0 .op_rope_inplace( &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].wk, ¤t), - n_embd / n_head, - n_head_kv, + &ctx0.op_mul_mat(&self.blocks[il].attn_k, ¤t), + n_embd_head, + head_count_kv, input_len, ), session_len, - n_rot, + n_embd_head, 0, overrides, ) .set_name("Kcur"); // store key and value to memory - // compute the transposed [N, n_embd] V matrix + // compute the transposed [N, embedding_length] V matrix let v_current = ctx0.op_transpose(&ctx0.op_reshape_2d( - &ctx0.op_mul_mat(&self.layers[il].wv, ¤t), - n_embd_gqa, + &ctx0.op_mul_mat(&self.blocks[il].attn_v, ¤t), + embedding_length_gqa, input_len, )); let k = ctx0.op_view_1d( builder.memory_k, - input_len * n_embd_gqa, - (builder.memory_k.element_size() * n_embd_gqa) * (il * ctx_size + session_len), + input_len * embedding_length_gqa, + (builder.memory_k.element_size() * embedding_length_gqa) + * (il * ctx_size + session_len), ); let v = ctx0.op_view_2d( builder.memory_v, - (input_len, n_embd_gqa), + (input_len, embedding_length_gqa), ctx_size * builder.memory_v.element_size(), - (il * ctx_size) * builder.memory_v.element_size() * n_embd_gqa + (il * ctx_size) * builder.memory_v.element_size() * embedding_length_gqa + session_len * builder.memory_v.element_size(), ); @@ -250,11 +226,13 @@ impl KnownModel for Llama { &ctx0.op_reshape_3d( &ctx0.op_view_1d( builder.memory_k, - (session_len + input_len) * n_embd_gqa, - il * ctx_size * builder.memory_k.element_size() * n_embd_gqa, + (session_len + input_len) * embedding_length_gqa, + il * ctx_size + * builder.memory_k.element_size() + * embedding_length_gqa, ), - n_embd / n_head, - n_head_kv, + n_embd_head, + head_count_kv, session_len + input_len, ), (0, 2, 1, 3), @@ -264,10 +242,10 @@ impl KnownModel for Llama { // K * Q let k_q = ctx0.op_mul_mat(&k, &q).set_name("KQ"); - // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled = KQ / sqrt(embedding_length/head_count) let kq_scale = ctx0 - .new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())) - .set_name("1/sqrt(n_embd/n_head)"); + .new_f32(1.0 / ((embedding_length as f32 / head_count as f32).sqrt())) + .set_name("1/sqrt(embedding_length/head_count)"); let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale).set_name("KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) @@ -280,16 +258,21 @@ impl KnownModel for Llama { .op_soft_max_inplace(&k_q_masked) .set_name("KQ_soft_max"); - // split cached V into n_head heads + // split cached V into head_count heads let v = ctx0 .op_view_3d( builder.memory_v, - (session_len + input_len, n_embd / n_head, n_head_kv), + ( + session_len + input_len, + embedding_length / head_count, + head_count_kv, + ), ( ctx_size * builder.memory_v.element_size(), - ctx_size * builder.memory_v.element_size() * n_embd / n_head, + ctx_size * builder.memory_v.element_size() * embedding_length + / head_count, ), - il * ctx_size * builder.memory_v.element_size() * n_embd_gqa, + il * ctx_size * builder.memory_v.element_size() * 
embedding_length_gqa, ) .set_name("V"); @@ -298,16 +281,16 @@ impl KnownModel for Llama { // KQV_merged = KQV.permute(0, 2, 1, 3) let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)).set_name("KQV_merged"); - // cur = KQV_merged.contiguous().view(n_embd, N) + // cur = KQV_merged.contiguous().view(embedding_length, N) current = ctx0 .op_cpy( &k_q_v_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), + &ctx0.new_tensor_2d(ggml::Type::F32, embedding_length, input_len), ) .set_name("KQV_merged_contiguous"); // projection (no bias) - current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + current = ctx0.op_mul_mat(&self.blocks[il].attn_output, ¤t); let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); @@ -316,18 +299,18 @@ impl KnownModel for Llama { current = ctx0.op_rms_norm(&input_feed_forward); // cur = cur*ffn_norm(broadcasted) - current = ctx0.op_mul(¤t, &self.layers[il].ffn_norm); + current = ctx0.op_mul(¤t, &self.blocks[il].ffn_norm); - let tmp = ctx0.op_mul_mat(&self.layers[il].w3, ¤t); + let tmp = ctx0.op_mul_mat(&self.blocks[il].ffn_up, ¤t); - current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); + current = ctx0.op_mul_mat(&self.blocks[il].ffn_gate, ¤t); // SILU activation current = ctx0.op_silu(¤t); current = ctx0.op_mul(¤t, &tmp); - current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); + current = ctx0.op_mul_mat(&self.blocks[il].ffn_down, ¤t); current = ctx0.op_add(¤t, &input_feed_forward); @@ -358,31 +341,28 @@ impl KnownModel for Llama { }); // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::read_last_token( + session, + &outputs.result, + vocabulary_count, + outputs.output_length, + ); common::extract_logits( output_request, &outputs.result, - n_vocab, + vocabulary_count, outputs.output_length, ); common::extract_embeddings( output_request, &outputs.embedding_result, - n_embd, + embedding_length, outputs.output_length, ); } - fn hyperparameters(&self) -> &Self::Hyperparameters { - &self.hyperparameters - } - - fn tokenizer(&self) -> &Tokenizer { - &self.tokenizer - } - - fn context_size(&self) -> usize { - self.params.context_size + fn data(&self) -> &ModelData { + &self.data } fn bot_token_id(&self) -> Option { @@ -390,14 +370,14 @@ impl KnownModel for Llama { } fn eot_token_id(&self) -> TokenId { - self.tokenizer.id("".as_bytes()).unwrap_or(2) + self.tokenizer().id("".as_bytes()).unwrap_or(2) } - fn quantize_tensors() -> Vec { + fn quantize_tensors(&self) -> Vec { vec![Regex::new(".*weight").unwrap()] } - fn skip_quantize_tensors() -> Vec { + fn skip_quantize_tensors(&self) -> Vec { vec![] } @@ -406,99 +386,55 @@ impl KnownModel for Llama { } } -/// LLaMA [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] -pub struct Hyperparameters { - /// Size of the model's vocabulary - pub n_vocab: usize, +#[derive(Debug, Default, PartialEq, Eq, Clone)] +struct Hyperparameters { /// Size of the model's embedding layer - pub n_embd: usize, - /// n_mult - pub n_mult: usize, - /// n_head - pub n_head: usize, - /// grouped-query attention - pub n_head_kv: usize, - /// Number of layers in the model - pub n_layer: usize, - /// n_rot - pub n_rot: usize, + embedding_length: usize, + /// The number of attention heads + head_count: usize, + /// The number of grouped-query attention heads + head_count_kv: usize, + /// Number of blocks in the model + block_count: usize, /// file_type - pub file_type: FileType, + file_type: 
Option, + /// The tensor data layout that this model was encoded with + tensor_data_layout: String, } - -impl llm_base::Hyperparameters for Hyperparameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - let n_vocab = util::read_i32(reader)?.try_into()?; - let n_embd = util::read_i32(reader)?.try_into()?; - let n_mult = util::read_i32(reader)?.try_into()?; - let n_head = util::read_i32(reader)?.try_into()?; - let n_layer = util::read_i32(reader)?.try_into()?; - let n_rot = util::read_i32(reader)?.try_into()?; - let file_type = util::read_filetype(reader)?; - - // Defaults to multi-head attention where n_head_kv == n_heads - let n_head_kv = n_head; - - Ok(Hyperparameters { - n_head, - n_head_kv, - n_vocab, - n_embd, - n_mult, - n_layer, - n_rot, - file_type, +impl Hyperparameters { + pub fn read(metadata: &Metadata) -> Result { + Ok(Self { + embedding_length: metadata.get_countable("llama.embedding_length")?, + head_count: metadata.get_countable("llama.attention.head_count")?, + head_count_kv: metadata.get_countable("llama.attention.head_count_kv")?, + block_count: metadata.get_countable("llama.block_count")?, + file_type: FileType::read_for_hyperparameters(metadata)?, + tensor_data_layout: metadata + .get_str("llama.tensor_data_layout") + .unwrap_or(META_TENSOR_DATA_LAYOUT) + .to_string(), }) } - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.n_vocab.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_mult.try_into()?)?; - util::write_i32(writer, self.n_head.try_into()?)?; - util::write_i32(writer, self.n_layer.try_into()?)?; - util::write_i32(writer, self.n_rot.try_into()?)?; - util::write_i32(writer, self.file_type.into())?; - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } - - fn file_type(&self) -> Option { - Some(self.file_type) - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - Some(&mut self.file_type) + /// Returns the number of grouped-query attention heads. + fn grouped_query_attention(&self) -> usize { + self.head_count / self.head_count_kv } } -struct Layer { - attention_norm: ggml::Tensor, +struct Block { + attn_n: ggml::Tensor, - wq: ggml::Tensor, - wk: ggml::Tensor, - wv: ggml::Tensor, - wo: ggml::Tensor, + attn_q: ggml::Tensor, + attn_k: ggml::Tensor, + attn_v: ggml::Tensor, + attn_output: ggml::Tensor, // normalization ffn_norm: ggml::Tensor, // ff - w1: ggml::Tensor, - w2: ggml::Tensor, - w3: ggml::Tensor, -} - -/// Available Llama models -enum LlamaModelType { - Model3b, - Model7b, - Model13b, - Model30b, - Model65b, - Model70b, + ffn_gate: ggml::Tensor, + ffn_down: ggml::Tensor, + ffn_up: ggml::Tensor, } diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 1e52d2d0..2685fcd1 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -1,371 +1,371 @@ -//! An implementation of [MPT](https://huggingface.co/mosaicml) for the `llm` ecosystem. -#![deny(missing_docs)] - -use ggml::Tensor; -use llm_base::{ - ggml::{self}, - model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, - ModelContext, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, -}; - -/// The MosaicML Pretrained Transformer (MPT) model. Ref: [Mosaic ML](https://www.mosaicml.com/blog/mpt-7b) -/// -/// # Safety -/// This implements [Send] and [Sync] as it is immutable after construction. 
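Stepping back to the LLaMA hunk above: a small worked example of the grouped-query-attention arithmetic behind embedding_length_gqa. The numbers are illustrative (roughly a 70B-class configuration), not read from any model file; the real values come from llama.attention.head_count and llama.attention.head_count_kv in the GGUF metadata.

fn main() {
    // Illustrative values only.
    let embedding_length = 8192usize;
    let head_count = 64usize;
    let head_count_kv = 8usize;

    // grouped_query_attention(): how many query heads share one K/V head.
    let gqa = head_count / head_count_kv; // 8

    // Per-token width of the K/V cache, as used for the memory_k/memory_v views.
    let embedding_length_gqa = embedding_length / gqa; // 1024

    // The per-head dimension is unchanged; only the number of cached heads shrinks.
    let n_embd_head = embedding_length / head_count; // 128
    assert_eq!(embedding_length_gqa, n_embd_head * head_count_kv);

    println!("gqa={gqa}, kv width per token={embedding_length_gqa}, head dim={n_embd_head}");
}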
-pub struct Mpt { - params: ModelParameters, - - hyperparameters: Hyperparameters, - tokenizer: Tokenizer, - - // model-global weights - // weighted token embeddings - wte: Tensor, - // normalization - norm: Tensor, - - // weights for the model - layers: Vec, - - // must be kept alive for the model - context: ModelContext, -} - -unsafe impl Send for Mpt {} -unsafe impl Sync for Mpt {} - -impl KnownModel for Mpt { - type Hyperparameters = Hyperparameters; - - fn new( - hyperparameters: Self::Hyperparameters, - params: ModelParameters, - tokenizer: Tokenizer, - tensor_loader: impl llm_base::TensorLoader, - ) -> Result { - let mut tl = tensor_loader; - - // model-gobal weights - let wte = tl.load("transformer.wte.weight")?; - let norm = tl.load("transformer.norm_f.weight")?; - - let mut layers = Vec::new(); - for i in 0..hyperparameters.n_layer { - let layer = Layer { - norm_1_weight: tl.load(&format!("transformer.blocks.{i}.norm_1.weight"))?, - c_attn_wqkv_weight: tl.load(&format!("transformer.blocks.{i}.attn.Wqkv.weight"))?, - - c_attn_out_proj_weight: tl - .load(&format!("transformer.blocks.{i}.attn.out_proj.weight"))?, - norm_2_weight: tl.load(&format!("transformer.blocks.{i}.norm_2.weight"))?, - - ffn_up_proj: tl.load(&format!("transformer.blocks.{i}.ffn.up_proj.weight"))?, - ffn_down_proj: tl.load(&format!("transformer.blocks.{i}.ffn.down_proj.weight"))?, - }; - - layers.push(layer); - } - - let context = tl.finish(); - - Ok(Mpt { - hyperparameters, - params, - tokenizer, - wte, - norm, - layers, - context, - }) - } - - fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { - InferenceSession::new( - config, - &self.params, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, - ) - } - - fn evaluate( - &self, - session: &mut InferenceSession, - input_tokens: &[TokenId], - output_request: &mut OutputRequest, - ) { - let ctx_size = self.params.context_size; - - let Hyperparameters { - n_embd, - n_head, - n_vocab, - n_layer, - alibi_bias_max, - .. 
- } = self.hyperparameters; - - let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let n = builder.input_length(); - let session_len = builder.n_past; - let ctx0 = builder.ctx0.borrow(); - let (memory_k_size, memory_v_size) = ( - builder.memory_k.element_size(), - builder.memory_v.element_size(), - ); - let embd = builder.embd; - - let mut input_layer = ctx0.op_get_rows(&self.wte, embd); - - let f32_size = std::mem::size_of::(); - - let mut gf = ctx0.create_compute_graph(); - for il in 0..n_layer { - let mut current = ctx0.op_norm(&input_layer); - current = ctx0.op_mul(¤t, &self.layers[il].norm_1_weight); - - current = ctx0.op_mul_mat(&self.layers[il].c_attn_wqkv_weight, ¤t); - - let nb = current.get_nb()[1]; - let qcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, 0); - let kcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, f32_size * n_embd); - let vcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, f32_size * n_embd * 2); - - let k = ctx0.op_view_1d( - builder.memory_k, - n * n_embd, - (memory_k_size * n_embd) * (il * ctx_size + session_len), - ); - let v = ctx0.op_view_1d( - builder.memory_v, - n * n_embd, - (memory_v_size * n_embd) * (il * ctx_size + session_len), - ); - - gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - - let q = ctx0.op_permute( - &ctx0.op_cpy( - &qcur, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), - ), - (0, 2, 1, 3), - ); - - let bigk = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - builder.memory_k, - (session_len + n) * n_embd, - il * ctx_size * memory_k_size * n_embd, - ), - n_embd / n_head, - n_head, - session_len + n, - ), - (0, 2, 1, 3), - ); - - let kq = ctx0.op_mul_mat(&bigk, &q); - let kq_scaled = ctx0.op_scale( - &kq, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - let kq_scaled_alibi = - ctx0.op_alibi(&kq_scaled, session_len, n_head, alibi_bias_max); - let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled_alibi, session_len); - let kq_softmax = ctx0.op_soft_max(&kq_masked); - - let v_trans = ctx0.op_cpy( - &ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - builder.memory_v, - (session_len + n) * n_embd, - il * ctx_size * memory_v_size * n_embd, - ), - n_embd / n_head, - n_head, - session_len + n, - ), - (1, 2, 0, 3), - ), - &ctx0.new_tensor_3d( - builder.memory_v.get_type(), - session_len + n, - n_embd / n_head, - n_head, - ), - ); - - let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); - let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - - current = ctx0.op_cpy(&kqv_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); - // projection - current = ctx0.op_mul_mat(&self.layers[il].c_attn_out_proj_weight, ¤t); - - input_layer = ctx0.op_add(&input_layer, ¤t); - - current = ctx0.op_norm(&input_layer); - current = ctx0.op_mul(¤t, &self.layers[il].norm_2_weight); - - current = ctx0.op_mul_mat(&self.layers[il].ffn_up_proj, ¤t); - - current = ctx0.op_gelu(¤t); - - // projection - current = ctx0.op_mul_mat(&self.layers[il].ffn_down_proj, ¤t); - - input_layer = ctx0.op_add(&input_layer, ¤t); - } - - // norm - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_mul(&input_layer, &self.norm); - - let embeddings_tensor: ggml::Tensor = input_layer.share(); - - // output embedding weight tied to input embedding - input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); - - ( - gf, - GraphOutputs { - result: input_layer, - embedding_result: embeddings_tensor, - output_length: n, - }, - ) - }); - - // finish evaluation - 
common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); - common::extract_logits( - output_request, - &outputs.result, - n_vocab, - outputs.output_length, - ); - common::extract_embeddings( - output_request, - &outputs.embedding_result, - n_embd, - outputs.output_length, - ); - } - - fn hyperparameters(&self) -> &Self::Hyperparameters { - &self.hyperparameters - } - - fn tokenizer(&self) -> &Tokenizer { - &self.tokenizer - } - - fn context_size(&self) -> usize { - self.params.context_size - } - - fn bot_token_id(&self) -> Option { - self.tokenizer.id("<|padding|>".as_bytes()) - } - - fn eot_token_id(&self) -> TokenId { - self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() - } - - fn quantize_tensors() -> Vec { - vec![Regex::new(".*weight").unwrap()] - } - - fn skip_quantize_tensors() -> Vec { - vec![] - } - - fn supports_rewind(&self) -> bool { - true - } -} - -/// MPT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -#[derive(Debug, Default, PartialEq, Clone, Copy)] -pub struct Hyperparameters { - /// Size of the model's embedding layer - n_embd: usize, - /// Maximum sequence length - max_seq_len: usize, - /// n_heads - n_head: usize, - /// Number of layers in the model - n_layer: usize, - /// Size of the model's vocabulary - n_vocab: usize, - /// Alibi bias max - alibi_bias_max: f32, - /// Clip KQV - clip_kqv: f32, - /// file_type - file_type: FileType, -} -impl Eq for Hyperparameters {} - -impl llm_base::Hyperparameters for Hyperparameters { - fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { - let hyperparameters = Hyperparameters { - n_embd: util::read_i32(reader)?.try_into()?, - max_seq_len: util::read_i32(reader)?.try_into()?, - n_head: util::read_i32(reader)?.try_into()?, - n_layer: util::read_i32(reader)?.try_into()?, - n_vocab: util::read_i32(reader)?.try_into()?, - alibi_bias_max: util::read_f32(reader)?, - clip_kqv: util::read_f32(reader)?, - file_type: util::read_filetype(reader)?, - }; - - Ok(hyperparameters) - } - - fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { - util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.max_seq_len.try_into()?)?; - util::write_i32(writer, self.n_head.try_into()?)?; - util::write_i32(writer, self.n_layer.try_into()?)?; - util::write_i32(writer, self.n_vocab.try_into()?)?; - util::write_f32(writer, self.alibi_bias_max)?; - util::write_f32(writer, self.clip_kqv)?; - util::write_i32(writer, self.file_type.into())?; - Ok(()) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } - - fn file_type(&self) -> Option { - Some(self.file_type) - } - - fn file_type_mut(&mut self) -> Option<&mut FileType> { - Some(&mut self.file_type) - } -} - -struct Layer { - // pre normalization - norm_1_weight: Tensor, - - // attention - c_attn_wqkv_weight: Tensor, - c_attn_out_proj_weight: Tensor, - - // post normalization - norm_2_weight: Tensor, - - // ff - ffn_up_proj: Tensor, - ffn_down_proj: Tensor, -} +// //! An implementation of [MPT](https://huggingface.co/mosaicml) for the `llm` ecosystem. +// #![deny(missing_docs)] + +// use ggml::Tensor; +// use llm_base::{ +// ggml::{self}, +// model::{common, HyperparametersWriteError}, +// util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, Model, LoadError, +// ModelContext, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, +// }; + +// /// The MosaicML Pretrained Transformer (MPT) model. 
Ref: [Mosaic ML](https://www.mosaicml.com/blog/mpt-7b) +// /// +// /// # Safety +// /// This implements [Send] and [Sync] as it is immutable after construction. +// pub struct Mpt { +// params: ModelParameters, + +// hyperparameters: Hyperparameters, +// tokenizer: Tokenizer, + +// // model-global weights +// // weighted token embeddings +// wte: Tensor, +// // normalization +// norm: Tensor, + +// // weights for the model +// layers: Vec, + +// // must be kept alive for the model +// context: ModelContext, +// } + +// unsafe impl Send for Mpt {} +// unsafe impl Sync for Mpt {} + +// impl Model for Mpt { +// type Hyperparameters = Hyperparameters; + +// fn new( +// hyperparameters: Self::Hyperparameters, +// params: ModelParameters, +// tokenizer: Tokenizer, +// tensor_loader: impl llm_base::TensorLoader, +// ) -> Result { +// let mut tl = tensor_loader; + +// // model-gobal weights +// let wte = tl.load("transformer.wte.weight")?; +// let norm = tl.load("transformer.norm_f.weight")?; + +// let mut layers = Vec::new(); +// for i in 0..hyperparameters.n_layer { +// let layer = Layer { +// norm_1_weight: tl.load(&format!("transformer.blocks.{i}.norm_1.weight"))?, +// c_attn_wqkv_weight: tl.load(&format!("transformer.blocks.{i}.attn.Wqkv.weight"))?, + +// c_attn_out_proj_weight: tl +// .load(&format!("transformer.blocks.{i}.attn.out_proj.weight"))?, +// norm_2_weight: tl.load(&format!("transformer.blocks.{i}.norm_2.weight"))?, + +// ffn_up_proj: tl.load(&format!("transformer.blocks.{i}.ffn.up_proj.weight"))?, +// ffn_down_proj: tl.load(&format!("transformer.blocks.{i}.ffn.down_proj.weight"))?, +// }; + +// layers.push(layer); +// } + +// let context = tl.finish(); + +// Ok(Mpt { +// hyperparameters, +// params, +// tokenizer, +// wte, +// norm, +// layers, +// context, +// }) +// } + +// fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { +// InferenceSession::new( +// config, +// &self.params, +// self.hyperparameters.n_layer, +// self.hyperparameters.n_embd, +// self.hyperparameters.n_vocab, +// ) +// } + +// fn evaluate( +// &self, +// session: &mut InferenceSession, +// input_tokens: &[TokenId], +// output_request: &mut OutputRequest, +// ) { +// let ctx_size = self.params.context_size; + +// let Hyperparameters { +// n_embd, +// n_head, +// n_vocab, +// n_layer, +// alibi_bias_max, +// .. 
+// } = self.hyperparameters; + +// let outputs = session.compute(self.context.clone(), input_tokens, |builder| { +// let n = builder.input_length(); +// let session_len = builder.n_past; +// let ctx0 = builder.ctx0.borrow(); +// let (memory_k_size, memory_v_size) = ( +// builder.memory_k.element_size(), +// builder.memory_v.element_size(), +// ); +// let embd = builder.embd; + +// let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + +// let f32_size = std::mem::size_of::(); + +// let mut gf = ctx0.create_compute_graph(); +// for il in 0..n_layer { +// let mut current = ctx0.op_norm(&input_layer); +// current = ctx0.op_mul(¤t, &self.layers[il].norm_1_weight); + +// current = ctx0.op_mul_mat(&self.layers[il].c_attn_wqkv_weight, ¤t); + +// let nb = current.get_nb()[1]; +// let qcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, 0); +// let kcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, f32_size * n_embd); +// let vcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, f32_size * n_embd * 2); + +// let k = ctx0.op_view_1d( +// builder.memory_k, +// n * n_embd, +// (memory_k_size * n_embd) * (il * ctx_size + session_len), +// ); +// let v = ctx0.op_view_1d( +// builder.memory_v, +// n * n_embd, +// (memory_v_size * n_embd) * (il * ctx_size + session_len), +// ); + +// gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); +// gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + +// let q = ctx0.op_permute( +// &ctx0.op_cpy( +// &qcur, +// &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), +// ), +// (0, 2, 1, 3), +// ); + +// let bigk = ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// builder.memory_k, +// (session_len + n) * n_embd, +// il * ctx_size * memory_k_size * n_embd, +// ), +// n_embd / n_head, +// n_head, +// session_len + n, +// ), +// (0, 2, 1, 3), +// ); + +// let kq = ctx0.op_mul_mat(&bigk, &q); +// let kq_scaled = ctx0.op_scale( +// &kq, +// &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), +// ); +// let kq_scaled_alibi = +// ctx0.op_alibi(&kq_scaled, session_len, n_head, alibi_bias_max); +// let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled_alibi, session_len); +// let kq_softmax = ctx0.op_soft_max(&kq_masked); + +// let v_trans = ctx0.op_cpy( +// &ctx0.op_permute( +// &ctx0.op_reshape_3d( +// &ctx0.op_view_1d( +// builder.memory_v, +// (session_len + n) * n_embd, +// il * ctx_size * memory_v_size * n_embd, +// ), +// n_embd / n_head, +// n_head, +// session_len + n, +// ), +// (1, 2, 0, 3), +// ), +// &ctx0.new_tensor_3d( +// builder.memory_v.get_type(), +// session_len + n, +// n_embd / n_head, +// n_head, +// ), +// ); + +// let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); +// let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + +// current = ctx0.op_cpy(&kqv_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); +// // projection +// current = ctx0.op_mul_mat(&self.layers[il].c_attn_out_proj_weight, ¤t); + +// input_layer = ctx0.op_add(&input_layer, ¤t); + +// current = ctx0.op_norm(&input_layer); +// current = ctx0.op_mul(¤t, &self.layers[il].norm_2_weight); + +// current = ctx0.op_mul_mat(&self.layers[il].ffn_up_proj, ¤t); + +// current = ctx0.op_gelu(¤t); + +// // projection +// current = ctx0.op_mul_mat(&self.layers[il].ffn_down_proj, ¤t); + +// input_layer = ctx0.op_add(&input_layer, ¤t); +// } + +// // norm +// input_layer = ctx0.op_norm(&input_layer); +// input_layer = ctx0.op_mul(&input_layer, &self.norm); + +// let embeddings_tensor: ggml::Tensor = input_layer.share(); + +// // output embedding weight tied to input embedding +// 
input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); + +// ( +// gf, +// GraphOutputs { +// result: input_layer, +// embedding_result: embeddings_tensor, +// output_length: n, +// }, +// ) +// }); + +// // finish evaluation +// common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); +// common::extract_logits( +// output_request, +// &outputs.result, +// n_vocab, +// outputs.output_length, +// ); +// common::extract_embeddings( +// output_request, +// &outputs.embedding_result, +// n_embd, +// outputs.output_length, +// ); +// } + +// fn hyperparameters(&self) -> &Self::Hyperparameters { +// &self.hyperparameters +// } + +// fn tokenizer(&self) -> &Tokenizer { +// &self.tokenizer +// } + +// fn context_size(&self) -> usize { +// self.params.context_size +// } + +// fn bot_token_id(&self) -> Option { +// self.tokenizer.id("<|padding|>".as_bytes()) +// } + +// fn eot_token_id(&self) -> TokenId { +// self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() +// } + +// fn quantize_tensors() -> Vec { +// vec![Regex::new(".*weight").unwrap()] +// } + +// fn skip_quantize_tensors() -> Vec { +// vec![] +// } + +// fn supports_rewind(&self) -> bool { +// true +// } +// } + +// /// MPT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +// #[derive(Debug, Default, PartialEq, Clone, Copy)] +// pub struct Hyperparameters { +// /// Size of the model's embedding layer +// n_embd: usize, +// /// Maximum sequence length +// max_seq_len: usize, +// /// n_heads +// n_head: usize, +// /// Number of layers in the model +// n_layer: usize, +// /// Size of the model's vocabulary +// n_vocab: usize, +// /// Alibi bias max +// alibi_bias_max: f32, +// /// Clip KQV +// clip_kqv: f32, +// /// file_type +// file_type: FileType, +// } +// impl Eq for Hyperparameters {} + +// impl llm_base::Hyperparameters for Hyperparameters { +// fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { +// let hyperparameters = Hyperparameters { +// n_embd: util::read_i32(reader)?.try_into()?, +// max_seq_len: util::read_i32(reader)?.try_into()?, +// n_head: util::read_i32(reader)?.try_into()?, +// n_layer: util::read_i32(reader)?.try_into()?, +// n_vocab: util::read_i32(reader)?.try_into()?, +// alibi_bias_max: util::read_f32(reader)?, +// clip_kqv: util::read_f32(reader)?, +// file_type: util::read_filetype(reader)?, +// }; + +// Ok(hyperparameters) +// } + +// fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { +// util::write_i32(writer, self.n_embd.try_into()?)?; +// util::write_i32(writer, self.max_seq_len.try_into()?)?; +// util::write_i32(writer, self.n_head.try_into()?)?; +// util::write_i32(writer, self.n_layer.try_into()?)?; +// util::write_i32(writer, self.n_vocab.try_into()?)?; +// util::write_f32(writer, self.alibi_bias_max)?; +// util::write_f32(writer, self.clip_kqv)?; +// util::write_i32(writer, self.file_type.into())?; +// Ok(()) +// } + +// fn n_vocabulary(&self) -> usize { +// self.n_vocab +// } + +// fn file_type(&self) -> Option { +// Some(self.file_type) +// } + +// fn file_type_mut(&mut self) -> Option<&mut FileType> { +// Some(&mut self.file_type) +// } +// } + +// struct Layer { +// // pre normalization +// norm_1_weight: Tensor, + +// // attention +// c_attn_wqkv_weight: Tensor, +// c_attn_out_proj_weight: Tensor, + +// // post normalization +// norm_2_weight: Tensor, + +// // ff +// ffn_up_proj: Tensor, +// ffn_down_proj: Tensor, +// }
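One final note on the key/value cache indexing that all of these evaluate implementations share: memory_k is treated as block_count contiguous slabs of ctx_size * embedding_length elements, so the write offset for the tokens being appended is (element_size * embedding_length) * (il * ctx_size + session_len). A self-contained check of that arithmetic with illustrative sizes; element_size stands in for memory_k.element_size():

fn main() {
    // Illustrative sizes; in the models above these come from the hyperparameters
    // and the session configuration.
    let embedding_length = 8usize; // per-token width stored in the cache
    let ctx_size = 16usize;        // maximum context length
    let element_size = 2usize;     // e.g. f16 cache elements
    let il = 3usize;               // layer index
    let session_len = 5usize;      // tokens already in the cache (n_past)
    let input_len = 2usize;        // tokens being appended now

    // Byte offset used by op_view_1d for the key-cache write.
    let offset = (element_size * embedding_length) * (il * ctx_size + session_len);

    // Equivalent decomposition: skip `il` whole layer slabs, then `session_len` rows.
    let layer_slab = element_size * embedding_length * ctx_size;
    let row = element_size * embedding_length;
    assert_eq!(offset, il * layer_slab + session_len * row);

    // The new rows must still fit inside this layer's slab.
    assert!(session_len + input_len <= ctx_size);
    println!("write {} bytes at offset {offset}", input_len * row);
}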