A local-first private AI assistant for everyday use. Runs on-device models with encrypted P2P sync, and supports sharing chats publicly on ATProto.
10
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge pull request #30 from tilesprivacy/feat/basic-linux-compatibility

v0.3.0 + Linux compatibility

authored by

Anandu Pavanan and committed by
GitHub
7ba3f4a0 d9178c64

+1405 -1233
+20
ATTRIBUTIONS.txt
··· 1 + This project includes code derived from third-party open-source projects. 2 + 3 + --- 4 + 5 + Project: mlx-knife 6 + Author: The BROKE team 🦫 7 + Source: https://github.com/mzau/mlx-knife 8 + License: MIT 9 + 10 + Description: 11 + Modules regarding mlx from mlx-knife have been used as our starting point and for further reference. 12 + 13 + 14 + Project: mem-agent-mcp 15 + Author: Dria 16 + Source: https://github.com/firstbatchxyz/mem-agent-mcp 17 + License: Apache-2.0 18 + 19 + Description: 20 + Modules regarding the mem-agent CLI from mem-agent-mcp have been used as our starting point and for further reference.
+154 -168
Cargo.lock
··· 34 34 35 35 [[package]] 36 36 name = "anstyle-query" 37 - version = "1.1.4" 37 + version = "1.1.5" 38 38 source = "registry+https://github.com/rust-lang/crates.io-index" 39 - checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" 39 + checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" 40 40 dependencies = [ 41 - "windows-sys 0.60.2", 41 + "windows-sys 0.61.2", 42 42 ] 43 43 44 44 [[package]] 45 45 name = "anstyle-wincon" 46 - version = "3.0.10" 46 + version = "3.0.11" 47 47 source = "registry+https://github.com/rust-lang/crates.io-index" 48 - checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" 48 + checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" 49 49 dependencies = [ 50 50 "anstyle", 51 51 "once_cell_polyfill", 52 - "windows-sys 0.60.2", 52 + "windows-sys 0.61.2", 53 53 ] 54 54 55 55 [[package]] ··· 78 78 79 79 [[package]] 80 80 name = "bumpalo" 81 - version = "3.19.0" 81 + version = "3.19.1" 82 82 source = "registry+https://github.com/rust-lang/crates.io-index" 83 - checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 83 + checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" 84 84 85 85 [[package]] 86 86 name = "bytes" 87 - version = "1.10.1" 87 + version = "1.11.0" 88 88 source = "registry+https://github.com/rust-lang/crates.io-index" 89 - checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" 89 + checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" 90 90 91 91 [[package]] 92 92 name = "cc" 93 - version = "1.2.41" 93 + version = "1.2.51" 94 94 source = "registry+https://github.com/rust-lang/crates.io-index" 95 - checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" 95 + checksum = "7a0aeaff4ff1a90589618835a598e545176939b97874f7abc7851caa0618f203" 96 96 dependencies = [ 97 97 "find-msvc-tools", 98 98 "shlex", ··· 106 106 107 107 
[[package]] 108 108 name = "clap" 109 - version = "4.5.50" 109 + version = "4.5.54" 110 110 source = "registry+https://github.com/rust-lang/crates.io-index" 111 - checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" 111 + checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" 112 112 dependencies = [ 113 113 "clap_builder", 114 114 "clap_derive", ··· 116 116 117 117 [[package]] 118 118 name = "clap_builder" 119 - version = "4.5.50" 119 + version = "4.5.54" 120 120 source = "registry+https://github.com/rust-lang/crates.io-index" 121 - checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" 121 + checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" 122 122 dependencies = [ 123 123 "anstream", 124 124 "anstyle", ··· 210 210 211 211 [[package]] 212 212 name = "find-msvc-tools" 213 - version = "0.1.4" 213 + version = "0.1.6" 214 214 source = "registry+https://github.com/rust-lang/crates.io-index" 215 - checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" 215 + checksum = "645cbb3a84e60b7531617d5ae4e57f7e27308f6445f5abf653209ea76dec8dff" 216 216 217 217 [[package]] 218 218 name = "fnv" ··· 350 350 351 351 [[package]] 352 352 name = "hashbrown" 353 - version = "0.16.0" 353 + version = "0.16.1" 354 354 source = "registry+https://github.com/rust-lang/crates.io-index" 355 - checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" 355 + checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" 356 356 357 357 [[package]] 358 358 name = "heck" ··· 362 362 363 363 [[package]] 364 364 name = "http" 365 - version = "1.3.1" 365 + version = "1.4.0" 366 366 source = "registry+https://github.com/rust-lang/crates.io-index" 367 - checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" 367 + checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" 368 368 dependencies = [ 369 369 
"bytes", 370 - "fnv", 371 370 "itoa", 372 371 ] 373 372 ··· 402 401 403 402 [[package]] 404 403 name = "hyper" 405 - version = "1.7.0" 404 + version = "1.8.1" 406 405 source = "registry+https://github.com/rust-lang/crates.io-index" 407 - checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" 406 + checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" 408 407 dependencies = [ 409 408 "atomic-waker", 410 409 "bytes", ··· 456 455 457 456 [[package]] 458 457 name = "hyper-util" 459 - version = "0.1.17" 458 + version = "0.1.19" 460 459 source = "registry+https://github.com/rust-lang/crates.io-index" 461 - checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" 460 + checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" 462 461 dependencies = [ 463 462 "base64", 464 463 "bytes", ··· 482 481 483 482 [[package]] 484 483 name = "icu_collections" 485 - version = "2.0.0" 484 + version = "2.1.1" 486 485 source = "registry+https://github.com/rust-lang/crates.io-index" 487 - checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" 486 + checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" 488 487 dependencies = [ 489 488 "displaydoc", 490 489 "potential_utf", ··· 495 494 496 495 [[package]] 497 496 name = "icu_locale_core" 498 - version = "2.0.0" 497 + version = "2.1.1" 499 498 source = "registry+https://github.com/rust-lang/crates.io-index" 500 - checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" 499 + checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" 501 500 dependencies = [ 502 501 "displaydoc", 503 502 "litemap", ··· 508 507 509 508 [[package]] 510 509 name = "icu_normalizer" 511 - version = "2.0.0" 510 + version = "2.1.1" 512 511 source = "registry+https://github.com/rust-lang/crates.io-index" 513 - checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" 512 + 
checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" 514 513 dependencies = [ 515 - "displaydoc", 516 514 "icu_collections", 517 515 "icu_normalizer_data", 518 516 "icu_properties", ··· 523 521 524 522 [[package]] 525 523 name = "icu_normalizer_data" 526 - version = "2.0.0" 524 + version = "2.1.1" 527 525 source = "registry+https://github.com/rust-lang/crates.io-index" 528 - checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" 526 + checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" 529 527 530 528 [[package]] 531 529 name = "icu_properties" 532 - version = "2.0.1" 530 + version = "2.1.2" 533 531 source = "registry+https://github.com/rust-lang/crates.io-index" 534 - checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" 532 + checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" 535 533 dependencies = [ 536 - "displaydoc", 537 534 "icu_collections", 538 535 "icu_locale_core", 539 536 "icu_properties_data", 540 537 "icu_provider", 541 - "potential_utf", 542 538 "zerotrie", 543 539 "zerovec", 544 540 ] 545 541 546 542 [[package]] 547 543 name = "icu_properties_data" 548 - version = "2.0.1" 544 + version = "2.1.2" 549 545 source = "registry+https://github.com/rust-lang/crates.io-index" 550 - checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" 546 + checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" 551 547 552 548 [[package]] 553 549 name = "icu_provider" 554 - version = "2.0.0" 550 + version = "2.1.1" 555 551 source = "registry+https://github.com/rust-lang/crates.io-index" 556 - checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" 552 + checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" 557 553 dependencies = [ 558 554 "displaydoc", 559 555 "icu_locale_core", 560 - "stable_deref_trait", 561 - "tinystr", 562 556 "writeable", 563 557 "yoke", 
564 558 "zerofrom", ··· 589 583 590 584 [[package]] 591 585 name = "indexmap" 592 - version = "2.12.0" 586 + version = "2.12.1" 593 587 source = "registry+https://github.com/rust-lang/crates.io-index" 594 - checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" 588 + checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" 595 589 dependencies = [ 596 590 "equivalent", 597 591 "hashbrown", ··· 605 599 606 600 [[package]] 607 601 name = "iri-string" 608 - version = "0.7.8" 602 + version = "0.7.10" 609 603 source = "registry+https://github.com/rust-lang/crates.io-index" 610 - checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" 604 + checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" 611 605 dependencies = [ 612 606 "memchr", 613 607 "serde", ··· 615 609 616 610 [[package]] 617 611 name = "is_terminal_polyfill" 618 - version = "1.70.1" 612 + version = "1.70.2" 619 613 source = "registry+https://github.com/rust-lang/crates.io-index" 620 - checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 614 + checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" 621 615 622 616 [[package]] 623 617 name = "itoa" 624 - version = "1.0.15" 618 + version = "1.0.17" 625 619 source = "registry+https://github.com/rust-lang/crates.io-index" 626 - checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 620 + checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" 627 621 628 622 [[package]] 629 623 name = "js-sys" 630 - version = "0.3.81" 624 + version = "0.3.83" 631 625 source = "registry+https://github.com/rust-lang/crates.io-index" 632 - checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" 626 + checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" 633 627 dependencies = [ 634 628 "once_cell", 635 629 "wasm-bindgen", ··· 637 631 638 632 [[package]] 639 
633 name = "libc" 640 - version = "0.2.177" 634 + version = "0.2.179" 641 635 source = "registry+https://github.com/rust-lang/crates.io-index" 642 - checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" 636 + checksum = "c5a2d376baa530d1238d133232d15e239abad80d05838b4b59354e5268af431f" 643 637 644 638 [[package]] 645 639 name = "linux-raw-sys" ··· 649 643 650 644 [[package]] 651 645 name = "litemap" 652 - version = "0.8.0" 646 + version = "0.8.1" 653 647 source = "registry+https://github.com/rust-lang/crates.io-index" 654 - checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" 648 + checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" 655 649 656 650 [[package]] 657 651 name = "log" 658 - version = "0.4.28" 652 + version = "0.4.29" 659 653 source = "registry+https://github.com/rust-lang/crates.io-index" 660 - checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" 654 + checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" 661 655 662 656 [[package]] 663 657 name = "memchr" ··· 673 667 674 668 [[package]] 675 669 name = "mio" 676 - version = "1.1.0" 670 + version = "1.1.1" 677 671 source = "registry+https://github.com/rust-lang/crates.io-index" 678 - checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" 672 + checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" 679 673 dependencies = [ 680 674 "libc", 681 675 "wasi", ··· 716 710 717 711 [[package]] 718 712 name = "once_cell_polyfill" 719 - version = "1.70.1" 713 + version = "1.70.2" 720 714 source = "registry+https://github.com/rust-lang/crates.io-index" 721 - checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" 715 + checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" 722 716 723 717 [[package]] 724 718 name = "openssl" 725 - version = "0.10.74" 719 + version = "0.10.75" 726 720 source = 
"registry+https://github.com/rust-lang/crates.io-index" 727 - checksum = "24ad14dd45412269e1a30f52ad8f0664f0f4f4a89ee8fe28c3b3527021ebb654" 721 + checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" 728 722 dependencies = [ 729 723 "bitflags", 730 724 "cfg-if", ··· 754 748 755 749 [[package]] 756 750 name = "openssl-sys" 757 - version = "0.9.110" 751 + version = "0.9.111" 758 752 source = "registry+https://github.com/rust-lang/crates.io-index" 759 - checksum = "0a9f0075ba3c21b09f8e8b2026584b1d18d49388648f2fbbf3c97ea8deced8e2" 753 + checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" 760 754 dependencies = [ 761 755 "cc", 762 756 "libc", ··· 796 790 797 791 [[package]] 798 792 name = "potential_utf" 799 - version = "0.1.3" 793 + version = "0.1.4" 800 794 source = "registry+https://github.com/rust-lang/crates.io-index" 801 - checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" 795 + checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" 802 796 dependencies = [ 803 797 "zerovec", 804 798 ] 805 799 806 800 [[package]] 807 801 name = "proc-macro2" 808 - version = "1.0.101" 802 + version = "1.0.104" 809 803 source = "registry+https://github.com/rust-lang/crates.io-index" 810 - checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" 804 + checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" 811 805 dependencies = [ 812 806 "unicode-ident", 813 807 ] 814 808 815 809 [[package]] 816 810 name = "quote" 817 - version = "1.0.41" 811 + version = "1.0.42" 818 812 source = "registry+https://github.com/rust-lang/crates.io-index" 819 - checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" 813 + checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" 820 814 dependencies = [ 821 815 "proc-macro2", 822 816 ] ··· 829 823 830 824 [[package]] 831 825 name = "reqwest" 832 - version = "0.12.24" 826 
+ version = "0.12.28" 833 827 source = "registry+https://github.com/rust-lang/crates.io-index" 834 - checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" 828 + checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" 835 829 dependencies = [ 836 830 "base64", 837 831 "bytes", ··· 887 881 888 882 [[package]] 889 883 name = "rustix" 890 - version = "1.1.2" 884 + version = "1.1.3" 891 885 source = "registry+https://github.com/rust-lang/crates.io-index" 892 - checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" 886 + checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" 893 887 dependencies = [ 894 888 "bitflags", 895 889 "errno", ··· 900 894 901 895 [[package]] 902 896 name = "rustls" 903 - version = "0.23.33" 897 + version = "0.23.35" 904 898 source = "registry+https://github.com/rust-lang/crates.io-index" 905 - checksum = "751e04a496ca00bb97a5e043158d23d66b5aabf2e1d5aa2a0aaebb1aafe6f82c" 899 + checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" 906 900 dependencies = [ 907 901 "once_cell", 908 902 "rustls-pki-types", ··· 913 907 914 908 [[package]] 915 909 name = "rustls-pki-types" 916 - version = "1.12.0" 910 + version = "1.13.2" 917 911 source = "registry+https://github.com/rust-lang/crates.io-index" 918 - checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" 912 + checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" 919 913 dependencies = [ 920 914 "zeroize", 921 915 ] 922 916 923 917 [[package]] 924 918 name = "rustls-webpki" 925 - version = "0.103.7" 919 + version = "0.103.8" 926 920 source = "registry+https://github.com/rust-lang/crates.io-index" 927 - checksum = "e10b3f4191e8a80e6b43eebabfac91e5dcecebb27a71f04e820c47ec41d314bf" 921 + checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" 928 922 dependencies = [ 929 923 "ring", 930 924 "rustls-pki-types", ··· 939 933 
940 934 [[package]] 941 935 name = "ryu" 942 - version = "1.0.20" 936 + version = "1.0.22" 943 937 source = "registry+https://github.com/rust-lang/crates.io-index" 944 - checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 938 + checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" 945 939 946 940 [[package]] 947 941 name = "schannel" ··· 1007 1001 1008 1002 [[package]] 1009 1003 name = "serde_json" 1010 - version = "1.0.145" 1004 + version = "1.0.148" 1011 1005 source = "registry+https://github.com/rust-lang/crates.io-index" 1012 - checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" 1006 + checksum = "3084b546a1dd6289475996f182a22aba973866ea8e8b02c51d9f46b1336a22da" 1013 1007 dependencies = [ 1014 1008 "itoa", 1015 1009 "memchr", 1016 - "ryu", 1017 1010 "serde", 1018 1011 "serde_core", 1012 + "zmij", 1019 1013 ] 1020 1014 1021 1015 [[package]] ··· 1078 1072 1079 1073 [[package]] 1080 1074 name = "syn" 1081 - version = "2.0.107" 1075 + version = "2.0.113" 1082 1076 source = "registry+https://github.com/rust-lang/crates.io-index" 1083 - checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" 1077 + checksum = "678faa00651c9eb72dd2020cbdf275d92eccb2400d568e419efdd64838145cb4" 1084 1078 dependencies = [ 1085 1079 "proc-macro2", 1086 1080 "quote", ··· 1130 1124 1131 1125 [[package]] 1132 1126 name = "tempfile" 1133 - version = "3.23.0" 1127 + version = "3.24.0" 1134 1128 source = "registry+https://github.com/rust-lang/crates.io-index" 1135 - checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" 1129 + checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" 1136 1130 dependencies = [ 1137 1131 "fastrand", 1138 1132 "getrandom 0.3.4", ··· 1142 1136 ] 1143 1137 1144 1138 [[package]] 1139 + name = "tilekit" 1140 + version = "0.1.0" 1141 + dependencies = [ 1142 + "nom", 1143 + ] 1144 + 1145 + [[package]] 1145 1146 name = "tiles" 1146 
- version = "0.2.0" 1147 + version = "0.3.0" 1147 1148 dependencies = [ 1148 1149 "anyhow", 1149 1150 "clap", 1150 1151 "futures-util", 1151 - "nom", 1152 1152 "owo-colors", 1153 1153 "reqwest", 1154 1154 "serde", 1155 1155 "serde_json", 1156 + "tilekit", 1156 1157 "tokio", 1157 1158 ] 1158 1159 1159 1160 [[package]] 1160 1161 name = "tinystr" 1161 - version = "0.8.1" 1162 + version = "0.8.2" 1162 1163 source = "registry+https://github.com/rust-lang/crates.io-index" 1163 - checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" 1164 + checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" 1164 1165 dependencies = [ 1165 1166 "displaydoc", 1166 1167 "zerovec", ··· 1168 1169 1169 1170 [[package]] 1170 1171 name = "tokio" 1171 - version = "1.48.0" 1172 + version = "1.49.0" 1172 1173 source = "registry+https://github.com/rust-lang/crates.io-index" 1173 - checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" 1174 + checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" 1174 1175 dependencies = [ 1175 1176 "bytes", 1176 1177 "libc", ··· 1214 1215 1215 1216 [[package]] 1216 1217 name = "tokio-util" 1217 - version = "0.7.16" 1218 + version = "0.7.18" 1218 1219 source = "registry+https://github.com/rust-lang/crates.io-index" 1219 - checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" 1220 + checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" 1220 1221 dependencies = [ 1221 1222 "bytes", 1222 1223 "futures-core", ··· 1242 1243 1243 1244 [[package]] 1244 1245 name = "tower-http" 1245 - version = "0.6.6" 1246 + version = "0.6.8" 1246 1247 source = "registry+https://github.com/rust-lang/crates.io-index" 1247 - checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" 1248 + checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" 1248 1249 dependencies = [ 1249 1250 "bitflags", 1250 1251 "bytes", 
··· 1272 1273 1273 1274 [[package]] 1274 1275 name = "tracing" 1275 - version = "0.1.41" 1276 + version = "0.1.44" 1276 1277 source = "registry+https://github.com/rust-lang/crates.io-index" 1277 - checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" 1278 + checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" 1278 1279 dependencies = [ 1279 1280 "pin-project-lite", 1280 1281 "tracing-core", ··· 1282 1283 1283 1284 [[package]] 1284 1285 name = "tracing-core" 1285 - version = "0.1.34" 1286 + version = "0.1.36" 1286 1287 source = "registry+https://github.com/rust-lang/crates.io-index" 1287 - checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" 1288 + checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" 1288 1289 dependencies = [ 1289 1290 "once_cell", 1290 1291 ] ··· 1297 1298 1298 1299 [[package]] 1299 1300 name = "unicode-ident" 1300 - version = "1.0.19" 1301 + version = "1.0.22" 1301 1302 source = "registry+https://github.com/rust-lang/crates.io-index" 1302 - checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" 1303 + checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" 1303 1304 1304 1305 [[package]] 1305 1306 name = "untrusted" ··· 1363 1364 1364 1365 [[package]] 1365 1366 name = "wasm-bindgen" 1366 - version = "0.2.104" 1367 + version = "0.2.106" 1367 1368 source = "registry+https://github.com/rust-lang/crates.io-index" 1368 - checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" 1369 + checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" 1369 1370 dependencies = [ 1370 1371 "cfg-if", 1371 1372 "once_cell", ··· 1375 1376 ] 1376 1377 1377 1378 [[package]] 1378 - name = "wasm-bindgen-backend" 1379 - version = "0.2.104" 1380 - source = "registry+https://github.com/rust-lang/crates.io-index" 1381 - checksum = 
"671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" 1382 - dependencies = [ 1383 - "bumpalo", 1384 - "log", 1385 - "proc-macro2", 1386 - "quote", 1387 - "syn", 1388 - "wasm-bindgen-shared", 1389 - ] 1390 - 1391 - [[package]] 1392 1379 name = "wasm-bindgen-futures" 1393 - version = "0.4.54" 1380 + version = "0.4.56" 1394 1381 source = "registry+https://github.com/rust-lang/crates.io-index" 1395 - checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" 1382 + checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" 1396 1383 dependencies = [ 1397 1384 "cfg-if", 1398 1385 "js-sys", ··· 1403 1390 1404 1391 [[package]] 1405 1392 name = "wasm-bindgen-macro" 1406 - version = "0.2.104" 1393 + version = "0.2.106" 1407 1394 source = "registry+https://github.com/rust-lang/crates.io-index" 1408 - checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" 1395 + checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" 1409 1396 dependencies = [ 1410 1397 "quote", 1411 1398 "wasm-bindgen-macro-support", ··· 1413 1400 1414 1401 [[package]] 1415 1402 name = "wasm-bindgen-macro-support" 1416 - version = "0.2.104" 1403 + version = "0.2.106" 1417 1404 source = "registry+https://github.com/rust-lang/crates.io-index" 1418 - checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" 1405 + checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" 1419 1406 dependencies = [ 1407 + "bumpalo", 1420 1408 "proc-macro2", 1421 1409 "quote", 1422 1410 "syn", 1423 - "wasm-bindgen-backend", 1424 1411 "wasm-bindgen-shared", 1425 1412 ] 1426 1413 1427 1414 [[package]] 1428 1415 name = "wasm-bindgen-shared" 1429 - version = "0.2.104" 1416 + version = "0.2.106" 1430 1417 source = "registry+https://github.com/rust-lang/crates.io-index" 1431 - checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" 1418 + checksum = 
"cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" 1432 1419 dependencies = [ 1433 1420 "unicode-ident", 1434 1421 ] ··· 1448 1435 1449 1436 [[package]] 1450 1437 name = "web-sys" 1451 - version = "0.3.81" 1438 + version = "0.3.83" 1452 1439 source = "registry+https://github.com/rust-lang/crates.io-index" 1453 - checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" 1440 + checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" 1454 1441 dependencies = [ 1455 1442 "js-sys", 1456 1443 "wasm-bindgen", ··· 1458 1445 1459 1446 [[package]] 1460 1447 name = "windows-link" 1461 - version = "0.1.3" 1462 - source = "registry+https://github.com/rust-lang/crates.io-index" 1463 - checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" 1464 - 1465 - [[package]] 1466 - name = "windows-link" 1467 1448 version = "0.2.1" 1468 1449 source = "registry+https://github.com/rust-lang/crates.io-index" 1469 1450 checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" 1470 1451 1471 1452 [[package]] 1472 1453 name = "windows-registry" 1473 - version = "0.5.3" 1454 + version = "0.6.1" 1474 1455 source = "registry+https://github.com/rust-lang/crates.io-index" 1475 - checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e" 1456 + checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" 1476 1457 dependencies = [ 1477 - "windows-link 0.1.3", 1458 + "windows-link", 1478 1459 "windows-result", 1479 1460 "windows-strings", 1480 1461 ] 1481 1462 1482 1463 [[package]] 1483 1464 name = "windows-result" 1484 - version = "0.3.4" 1465 + version = "0.4.1" 1485 1466 source = "registry+https://github.com/rust-lang/crates.io-index" 1486 - checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" 1467 + checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" 1487 1468 dependencies = [ 1488 - "windows-link 0.1.3", 1469 + 
"windows-link", 1489 1470 ] 1490 1471 1491 1472 [[package]] 1492 1473 name = "windows-strings" 1493 - version = "0.4.2" 1474 + version = "0.5.1" 1494 1475 source = "registry+https://github.com/rust-lang/crates.io-index" 1495 - checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" 1476 + checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" 1496 1477 dependencies = [ 1497 - "windows-link 0.1.3", 1478 + "windows-link", 1498 1479 ] 1499 1480 1500 1481 [[package]] ··· 1521 1502 source = "registry+https://github.com/rust-lang/crates.io-index" 1522 1503 checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" 1523 1504 dependencies = [ 1524 - "windows-link 0.2.1", 1505 + "windows-link", 1525 1506 ] 1526 1507 1527 1508 [[package]] ··· 1546 1527 source = "registry+https://github.com/rust-lang/crates.io-index" 1547 1528 checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" 1548 1529 dependencies = [ 1549 - "windows-link 0.2.1", 1530 + "windows-link", 1550 1531 "windows_aarch64_gnullvm 0.53.1", 1551 1532 "windows_aarch64_msvc 0.53.1", 1552 1533 "windows_i686_gnu 0.53.1", ··· 1661 1642 1662 1643 [[package]] 1663 1644 name = "writeable" 1664 - version = "0.6.1" 1645 + version = "0.6.2" 1665 1646 source = "registry+https://github.com/rust-lang/crates.io-index" 1666 - checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" 1647 + checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" 1667 1648 1668 1649 [[package]] 1669 1650 name = "yoke" 1670 - version = "0.8.0" 1651 + version = "0.8.1" 1671 1652 source = "registry+https://github.com/rust-lang/crates.io-index" 1672 - checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" 1653 + checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" 1673 1654 dependencies = [ 1674 - "serde", 1675 1655 "stable_deref_trait", 1676 1656 "yoke-derive", 1677 1657 
"zerofrom", ··· 1679 1659 1680 1660 [[package]] 1681 1661 name = "yoke-derive" 1682 - version = "0.8.0" 1662 + version = "0.8.1" 1683 1663 source = "registry+https://github.com/rust-lang/crates.io-index" 1684 - checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" 1664 + checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" 1685 1665 dependencies = [ 1686 1666 "proc-macro2", 1687 1667 "quote", ··· 1718 1698 1719 1699 [[package]] 1720 1700 name = "zerotrie" 1721 - version = "0.2.2" 1701 + version = "0.2.3" 1722 1702 source = "registry+https://github.com/rust-lang/crates.io-index" 1723 - checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" 1703 + checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" 1724 1704 dependencies = [ 1725 1705 "displaydoc", 1726 1706 "yoke", ··· 1729 1709 1730 1710 [[package]] 1731 1711 name = "zerovec" 1732 - version = "0.11.4" 1712 + version = "0.11.5" 1733 1713 source = "registry+https://github.com/rust-lang/crates.io-index" 1734 - checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" 1714 + checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" 1735 1715 dependencies = [ 1736 1716 "yoke", 1737 1717 "zerofrom", ··· 1740 1720 1741 1721 [[package]] 1742 1722 name = "zerovec-derive" 1743 - version = "0.11.1" 1723 + version = "0.11.2" 1744 1724 source = "registry+https://github.com/rust-lang/crates.io-index" 1745 - checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" 1725 + checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" 1746 1726 dependencies = [ 1747 1727 "proc-macro2", 1748 1728 "quote", 1749 1729 "syn", 1750 1730 ] 1731 + 1732 + [[package]] 1733 + name = "zmij" 1734 + version = "1.0.10" 1735 + source = "registry+https://github.com/rust-lang/crates.io-index" 1736 + checksum = "30e0d8dffbae3d840f64bda38e28391faef673a7b5a6017840f2a106c8145868"
+6 -14
Cargo.toml
··· 1 - [package] 2 - name = "tiles" 3 - version = "0.2.0" 4 - edition = "2024" 1 + [workspace] 2 + resolver = "3" 3 + members = [ 4 + "tilekit", 5 + "tiles", 6 + ] 5 7 6 - [dependencies] 7 - clap = { version = "4.5.48", features = ["derive"] } 8 - nom = "8" 9 - reqwest = { version = "0.12", features = ["json", "blocking", "stream"] } 10 - serde = { version = "1.0", features = ["derive"] } 11 - serde_json = "1.0" 12 - anyhow = "1.0" 13 - tokio = { version = "1" , features = ["macros", "rt-multi-thread"]} 14 - owo-colors = "4" 15 - futures-util = "0.3"
+6
HACKING.md
··· 30 30 ``` 31 31 32 32 2. In another terminal, run the Rust CLI using Cargo as usual. 33 + 34 + ```sh 35 + cd tiles 36 + 37 + cargo run 38 + ```
fixtures/a.modelfile tilekit/fixtures/a.modelfile
fixtures/llama_bad.Modelfile tilekit/fixtures/llama_bad.Modelfile
fixtures/mistral.modelfile tilekit/fixtures/mistral.modelfile
+2 -1
justfile
··· 12 12 cargo test 13 13 14 14 serve: 15 - uv run --project server python -m server.main 15 + server/.venv/bin/python3 -m server.main 16 + # uv run --project server python -m server.main 16 17 17 18 bundle: 18 19 ./scripts/bundler.sh
+1 -1
scripts/bundler.sh
··· 22 22 rm -rf "${DIST_DIR}/tmp/server/.venv" 23 23 24 24 echo "📦 Creating ${OUT_NAME}.tar.gz..." 25 - tar -czf "${DIST_DIR}/${OUT_NAME}.tar.gz" -C "${DIST_DIR}/tmp" . 25 + tar --exclude-from=scripts/tar.exclude -czf "${DIST_DIR}/${OUT_NAME}.tar.gz" -C "${DIST_DIR}/tmp" . 26 26 27 27 rm -rf "${DIST_DIR}/tmp" 28 28
+5 -5
scripts/install.sh
··· 1 1 #!/usr/bin/env bash 2 2 set -euo pipefail 3 3 4 - ENV="prod" # prod is another env, try taking it from github env 5 - REPO="tilesprivacy/tilekit" 4 + ENV="dev" # prod is another env, try taking it from github env 5 + REPO="tilesprivacy/tiles" 6 6 # VERSION="${TILES_VERSION:-latest}" 7 - VERSION="0.2.0" 7 + VERSION="0.3.0" 8 8 INSTALL_DIR="$HOME/.local/bin" # CLI install location 9 9 SERVER_DIR="$HOME/.local/share/tiles/server" # Python server folder 10 10 TMPDIR="$(mktemp -d)" ··· 43 43 TAR_URL="https://github.com/${REPO}/releases/download/${VERSION}/tiles-v${VERSION}-${ARCH}-${OS}.tar.gz" 44 44 curl -fsSL -o "${TMPDIR}/tiles.tar.gz" "$TAR_URL" 45 45 else 46 - # Installer suppose to ran from tilekit root folder after running the bundler 46 + # The installer is supposed to be run from the tiles root folder after running the bundler 47 47 mv "dist/tiles-v${VERSION}-${ARCH}-${OS}.tar.gz" "${TMPDIR}/tiles.tar.gz" 48 48 fi 49 49 ··· 71 71 log "Installing Python 3.13 via Homebrew..." 72 72 brew install python@3.13 || err "Failed to install Python 3.13" 73 73 else 74 - err "Python 3.13 is required but not found. Please install it manually." 74 + err "Python 3.13 is required but not found. Please install it manually and retry installing tiles" 75 75 fi 76 76 fi 77 77
+8
scripts/tar.exclude
··· 1 + __pycache__ 2 + *.pyc 3 + *.pyo 4 + .venv 5 + .env 6 + .git 7 + .DS_Store 8 +
+1
server/.gitignore
··· 1 1 __pycache__/ 2 2 *.egg-info/ 3 3 .venv/ 4 + backend/__pycache__
+27 -414
server/api.py
··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 
22 - 23 1 from fastapi import FastAPI, HTTPException 2 + 3 + from .schemas import ChatMessage, ChatCompletionRequest, StartRequest, downloadRequest 24 4 from .config import SYSTEM_PROMPT 25 5 import logging 26 - import json 27 - import time 28 - import uuid 29 - from collections.abc import AsyncGenerator 30 - from typing import Any, Dict, List, Optional, Union 6 + import sys 7 + from typing import Optional 31 8 32 9 from fastapi.responses import StreamingResponse 33 10 from pydantic import BaseModel, Field 34 11 35 - from .cache_utils import ( 36 - get_model_path 37 - ) 38 12 from .hf_downloader import pull_model 39 13 40 - from .mlx_runner import MLXRunner 41 - 42 - from server.mem_agent.utils import extract_python_code, extract_reply, extract_thoughts, create_memory_if_not_exists, format_results 14 + from server.mem_agent.utils import ( 15 + create_memory_if_not_exists, 16 + format_results, 17 + ) 43 18 from server.mem_agent.engine import execute_sandboxed_code 44 - # Global model cache and configuration 19 + 20 + from . 
import runtime 45 21 46 22 logger = logging.getLogger("app") 47 - _model_cache: Dict[str, MLXRunner] = {} 48 23 _current_model_path: Optional[str] = None 49 24 _default_max_tokens: Optional[int] = None # Use dynamic model-aware limits by default 50 - _runner: MLXRunner = {} 51 - _max_tool_turns = 5 52 25 _memory_path = "" 53 26 54 - class CompletionRequest(BaseModel): 55 - model: str 56 - prompt: Union[str, List[str]] 57 - max_tokens: Optional[int] = None 58 - temperature: Optional[float] = 0.7 59 - top_p: Optional[float] = 0.9 60 - stream: Optional[bool] = False 61 - stop: Optional[Union[str, List[str]]] = None 62 - repetition_penalty: Optional[float] = 1.1 27 + _messages: list[ChatMessage] = [] 63 28 64 29 65 - class ChatMessage(BaseModel): 66 - role: str = Field(..., pattern="^(system|user|assistant)$") 67 - content: str 68 - 69 - _messages: list[ChatMessage]= [] 70 - 71 - class ChatCompletionRequest(BaseModel): 72 - model: str 73 - messages: List[ChatMessage] 74 - chat_start: bool 75 - python_code: str 76 - max_tokens: Optional[int] = None 77 - temperature: Optional[float] = 0.7 78 - top_p: Optional[float] = 0.9 79 - stream: Optional[bool] = False 80 - stop: Optional[Union[str, List[str]]] = None 81 - repetition_penalty: Optional[float] = 1.1 82 - 83 - 84 - class CompletionResponse(BaseModel): 85 - id: str 86 - object: str = "text_completion" 87 - created: int 88 - model: str 89 - choices: List[Dict[str, Any]] 90 - usage: Dict[str, int] 91 - 92 - 93 - class ChatCompletionResponse(BaseModel): 94 - id: str 95 - object: str = "chat.completion" 96 - created: int 97 - model: str 98 - choices: List[Dict[str, Any]] 99 - # usage: Dict[str, int] 100 - 101 - 102 - class ModelInfo(BaseModel): 103 - id: str 104 - object: str = "model" 105 - owned_by: str = "mlx-knife" 106 - permission: List = [] 107 - context_length: Optional[int] = None 108 - 109 - class StartRequest(BaseModel): 110 - model: str 111 - memory_path: str 112 - 113 - class downloadRequest(BaseModel): 114 - 
model: str 115 - 116 - class Agent: 117 - def __init__( 118 - self, 119 - max_tool_turns: int = 20, 120 - memory_path: str = None, 121 - use_vllm: bool = False, 122 - model: str = None, 123 - predetermined_memory_path: bool = False, 124 - model_cache: Dict[str, MLXRunner] = {}, 125 - current_model_path: Optional[str] = None, 126 - default_max_tokens: Optional[int] = None # Use dynamic model-aware limits by default 127 - 128 - ): 129 - # Load the system prompt and add it to the conversation history 130 - self.system_prompt = SYSTEM_PROMPT 131 - self.messages: list[ChatMessage] = [ 132 - ChatMessage(role="system", content=self.system_prompt) 133 - ] 134 - 135 - # Set the maximum number of tool turns and use_vllm flag 136 - self.max_tool_turns = max_tool_turns 137 - self.use_vllm = use_vllm 138 - 139 30 app = FastAPI() 140 31 141 - agent: Agent() 142 - 143 - def get_or_load_model(model_spec: str, verbose: bool = False) -> MLXRunner: 144 - """Get model from cache or load it if not cached.""" 145 - global _model_cache, _current_model_path 146 - 147 - # Use the existing model path resolution from cache_utils 148 - 149 - try: 150 - model_path, model_name, commit_hash = get_model_path(model_spec) 151 - if not model_path.exists(): 152 - logger.info(f"Model {model_spec} not found in cache") 153 - raise HTTPException(status_code=404, detail=f"Model {model_spec} not found in cache") 154 - except Exception as e: 155 - logger.info(f"Model {model_spec} not found in: {str(e)}") 156 - raise HTTPException(status_code=404, detail=f"Model {model_spec} not found: {str(e)}") 157 - 158 - # Check if it's an MLX model 159 - 160 - model_path_str = str(model_path) 161 - 162 - # Check if we need to load a different model 163 - if _current_model_path != model_path_str: 164 - # Proactively clean up any previously loaded runner to release memory 165 - if _model_cache: 166 - try: 167 - for _old_runner in list(_model_cache.values()): 168 - try: 169 - _old_runner.cleanup() 170 - except Exception: 
171 - pass 172 - finally: 173 - _model_cache.clear() 174 - 175 - # Load new model 176 - if verbose: 177 - print(f"Loading model: {model_name}") 178 - 179 - logger.info(f"Loading model: {model_name}") 180 - runner = MLXRunner(model_path_str, verbose=verbose) 181 - runner.load_model() 182 - 183 - _model_cache[model_path_str] = runner 184 - _current_model_path = model_path_str 185 - else: 186 - logger.info(f"Model {model_name} already in memory") 187 - 188 - return _model_cache[model_path_str] 189 - 190 - def format_chat_messages_for_runner(messages: List[ChatMessage]) -> List[Dict[str, str]]: 191 - """Convert chat messages to format expected by MLXRunner. 192 - 193 - Returns messages in dict format for the runner to apply chat templates. 194 - """ 195 - return [{"role": msg.role, "content": msg.content} for msg in messages] 196 - 197 - 198 - def count_tokens(text: str) -> int: 199 - """Rough token count estimation.""" 200 - return int(len(text.split()) * 1.3) # Approximation, convert to int 201 32 202 33 @app.get("/ping") 203 34 async def ping(): 204 - return {"message": "Badda-Bing Badda-Bang"} 35 + return {"message": "Badda-Bing Badda-Bang"} 36 + 205 37 206 38 @app.post("/download") 207 - async def download(request:downloadRequest): 208 - """ Download the model """ 209 - try: 210 - if pull_model(request.model): 211 - return {"message": "Model downloaded"} 212 - else: 213 - raise HTTPException(status_code=400, detail="Downloading model failed") 214 - except Exception as e: 215 - raise HTTPException(status_code=500, detail=str(e)) 39 + async def download(request: downloadRequest): 40 + """Download the model""" 41 + runtime.backend.download_model(request.model) 216 42 217 43 @app.post("/start") 218 44 async def start_model(request: StartRequest): 219 45 """Load the model and start the agent""" 220 - global _messages, _runner,_memory_path 46 + global _messages, _runner, _memory_path 221 47 222 48 _messages = [ChatMessage(role="system", content=SYSTEM_PROMPT)] 223 49 
_memory_path = request.memory_path 50 + logger.info(f"{runtime.backend}") 51 + runtime.backend.get_or_load_model(request.model) 52 + return {"message": "Model loaded"} 224 53 225 - _runner = get_or_load_model(request.model) 226 - return {"message": "Model loaded"} 227 54 228 55 @app.post("/v1/chat/completions") 229 56 async def create_chat_completion(request: ChatCompletionRequest): 230 57 """Create a chat completion.""" 231 - global _messages, _max_tool_turns, _memory_path 58 + global _messages, _memory_path 232 59 try: 233 - runner = get_or_load_model(request.model) 234 60 235 61 if request.stream: 236 62 result = ({}, "") 237 63 if request.python_code: 238 - create_memory_if_not_exists() 239 64 result = execute_sandboxed_code( 240 65 code=request.python_code, 241 66 allowed_path=_memory_path, 242 67 import_module="server.mem_agent.tools", 243 68 ) 244 69 245 - _messages.append(ChatMessage(role="user", content=format_results(result[0], result[1]))) 246 - 70 + _messages.append( 71 + ChatMessage(role="user", content=format_results(result[0], result[1])) 72 + ) 73 + 247 74 # Streaming response 248 75 return StreamingResponse( 249 - generate_chat_stream(runner, request.messages, request), 76 + runtime.backend.generate_chat_stream(_messages, request), 250 77 media_type="text/plain", 251 - headers={"Cache-Control": "no-cache"} 252 - ) 253 - else: 254 - # Non-streaming response 255 - completion_id = f"chatcmpl-{uuid.uuid4()}" 256 - created = int(time.time()) 257 - 258 - # Convert messages to dict format for runner 259 - # _messages.append(system_message) 260 - if request.chat_start: 261 - _messages.extend(request.messages) 262 - message_dicts = format_chat_messages_for_runner(_messages) 263 - # Let the runner format with chat templates 264 - prompt = runner._format_conversation(message_dicts, use_chat_template=True) 265 - 266 - generated_text = runner.generate_batch( 267 - prompt=prompt, 268 - max_tokens=runner.get_effective_max_tokens(request.max_tokens or 
_default_max_tokens, interactive=False), 269 - temperature=request.temperature, 270 - top_p=request.top_p, 271 - repetition_penalty=request.repetition_penalty, 272 - use_chat_template=False # Already applied in _format_conversation 273 - ) 274 - 275 - # Token counting 276 - total_prompt = "\n\n".join([msg.content for msg in request.messages]) 277 - prompt_tokens = count_tokens(total_prompt) 278 - completion_tokens = count_tokens(generated_text) 279 - 280 - logger.info(f"prompt_token\n{prompt_tokens}") 281 - logger.info(f"completion_tokens\n{completion_tokens}") 282 - 283 - thoughts = extract_thoughts(generated_text) 284 - reply = extract_reply(generated_text) 285 - python_code = extract_python_code(generated_text) 286 - 287 - result = ({}, "") 288 - if python_code: 289 - create_memory_if_not_exists() 290 - result = execute_sandboxed_code( 291 - code=python_code, 292 - allowed_path=_memory_path, 293 - import_module="server.mem_agent.tools", 294 - ) 295 - 296 - logger.info(f"Model thoughts\n{thoughts}") 297 - logger.info(f"Model reply\n{reply}") 298 - logger.info(f"Model python\n{python_code}") 299 - logger.info(f"executed python result\n{str(result)}") 300 - 301 - # while remaining_tool_turns > 0 and not reply: 302 - # logger.info(f"Turn count\n{remaining_tool_turns}") 303 - _messages.append(ChatMessage(role="user", content=format_results(result[0], result[1]))) 304 - message_dicts = format_chat_messages_for_runner(_messages) 305 - # # Let the runner format with chat templates 306 - # prompt = runner._format_conversation(message_dicts, use_chat_template=True) 307 - # generated_text = runner.generate_batch( 308 - # prompt=prompt 309 - # ) 310 - 311 - # total_prompt = "\n\n".join([msg.content for msg in _messages]) 312 - # prompt_tokens = count_tokens(total_prompt) 313 - # completion_tokens = count_tokens(generated_text) 314 - 315 - # logger.info(f"prompt_token\n{prompt_tokens}") 316 - # logger.info(f"completion_tokens\n{completion_tokens}") 317 - 318 - # # 
print(generated_text) 319 - # # Extract the thoughts, reply and python code from the response 320 - # thoughts = extract_thoughts(generated_text) 321 - # reply = extract_reply(generated_text) 322 - # python_code = extract_python_code(generated_text) 323 - 324 - # logger.info(f"Model thoughts\n{thoughts}") 325 - # logger.info(f"Model reply\n{reply}") 326 - # logger.info(f"Model python\n{python_code}") 327 - 328 - # _messages.append(ChatMessage(role="assistant", content=generated_text)) 329 - # if python_code: 330 - # create_memory_if_not_exists() 331 - # result = execute_sandboxed_code( 332 - # code=python_code, 333 - # allowed_path=_memory_path, 334 - # import_module="server.mem_agent.tools", 335 - # ) 336 - # logger.info(f"executed python result\n{str(result)}") 337 - # else: 338 - # # Reset result when no Python code is executed 339 - # result = ({}, "") 340 - # logger.info(f"executed python result\n{str(result)}") 341 - # remaining_tool_turns -= 1 342 - 343 - return ChatCompletionResponse( 344 - id=completion_id, 345 - created=created, 346 - model=request.model, 347 - choices=[ 348 - { 349 - "index": 0, 350 - "message": { 351 - "role": "assistant", 352 - "content": generated_text 353 - }, 354 - "finish_reason": "stop" 355 - } 356 - ], 357 - # usage={ 358 - # "prompt_tokens": prompt_tokens, 359 - # "completion_tokens": completion_tokens, 360 - # "total_tokens": prompt_tokens + completion_tokens 361 - # } 78 + headers={"Cache-Control": "no-cache"}, 362 79 ) 363 80 except Exception as e: 364 81 raise HTTPException(status_code=500, detail=str(e)) 365 - 366 - async def generate_chat_stream( 367 - runner: MLXRunner, 368 - messages: List[ChatMessage], 369 - request: ChatCompletionRequest 370 - ) -> AsyncGenerator[str, None]: 371 - """Generate streaming chat completion response.""" 372 - 373 - global _messages 374 - completion_id = f"chatcmpl-{uuid.uuid4()}" 375 - created = int(time.time()) 376 - 377 - if request.chat_start: 378 - _messages.extend(request.messages) 379 
- # Convert messages to dict format for runner 380 - message_dicts = format_chat_messages_for_runner(_messages) 381 - 382 - # Let the runner format with chat templates 383 - prompt = runner._format_conversation(message_dicts, use_chat_template=True) 384 - 385 - # Yield initial response 386 - initial_response = { 387 - "id": completion_id, 388 - "object": "chat.completion.chunk", 389 - "created": created, 390 - "model": request.model, 391 - "choices": [ 392 - { 393 - "index": 0, 394 - "delta": {"role": "assistant", "content": ""}, 395 - "finish_reason": None 396 - } 397 - ] 398 - } 399 - 400 - yield f"data: {json.dumps(initial_response)}\n\n" 401 - 402 - # Stream tokens 403 - try: 404 - for token in runner.generate_streaming( 405 - prompt=prompt, 406 - max_tokens=runner.get_effective_max_tokens(request.max_tokens or _default_max_tokens, interactive=False), 407 - temperature=request.temperature, 408 - top_p=request.top_p, 409 - repetition_penalty=request.repetition_penalty, 410 - use_chat_template=False, # Already applied in _format_conversation 411 - use_chat_stop_tokens=False # Server mode shouldn't stop on chat markers 412 - ): 413 - chunk_response = { 414 - "id": completion_id, 415 - "object": "chat.completion.chunk", 416 - "created": created, 417 - "model": request.model, 418 - "choices": [ 419 - { 420 - "index": 0, 421 - "delta": {"content": token}, 422 - "finish_reason": None 423 - } 424 - ] 425 - } 426 - 427 - yield f"data: {json.dumps(chunk_response)}\n\n" 428 - 429 - # Check for stop sequences 430 - if request.stop: 431 - stop_sequences = request.stop if isinstance(request.stop, list) else [request.stop] 432 - if any(stop in token for stop in stop_sequences): 433 - break 434 - 435 - except Exception as e: 436 - error_response = { 437 - "id": completion_id, 438 - "object": "chat.completion.chunk", 439 - "created": created, 440 - "model": request.model, 441 - "choices": [ 442 - { 443 - "index": 0, 444 - "delta": {}, 445 - "finish_reason": "error" 446 - } 447 
- ], 448 - "error": str(e) 449 - } 450 - yield f"data: {json.dumps(error_response)}\n\n" 451 - 452 - # Final response 453 - final_response = { 454 - "id": completion_id, 455 - "object": "chat.completion.chunk", 456 - "created": created, 457 - "model": request.model, 458 - "choices": [ 459 - { 460 - "index": 0, 461 - "delta": {}, 462 - "finish_reason": "stop" 463 - } 464 - ] 465 - } 466 - 467 - yield f"data: {json.dumps(final_response)}\n\n" 468 - yield "data: [DONE]\n\n"
+1
server/backend/linux.py
··· 1 + # Module for linux backend
+183
server/backend/mlx.py
··· 1 + from .mlx_runner import MLXRunner 2 + from ..cache_utils import get_model_path 3 + from fastapi import HTTPException 4 + from ..schemas import ChatMessage, ChatCompletionRequest, downloadRequest 5 + from ..hf_downloader import pull_model 6 + 7 + import logging 8 + import json 9 + import time 10 + import uuid 11 + from collections.abc import AsyncGenerator 12 + 13 + logger = logging.getLogger("app") 14 + 15 + from typing import Any, Dict, List, Optional, Union 16 + 17 + _model_cache: Dict[str, MLXRunner] = {} 18 + _default_max_tokens: Optional[int] = None # Use dynamic model-aware limits by default 19 + _current_model_path: Optional[str] = None 20 + 21 + 22 + def download_model(model_name: str): 23 + """Download the model""" 24 + if pull_model(model_name): 25 + return {"message": "Model downloaded"} 26 + else: 27 + raise HTTPException(status_code=400, detail="Downloading model failed") 28 + 29 + 30 + def get_or_load_model(model_spec: str, verbose: bool = False) -> MLXRunner: 31 + """Get model from cache or load it if not cached.""" 32 + global _model_cache, _current_model_path 33 + 34 + # Use the existing model path resolution from cache_utils 35 + 36 + try: 37 + model_path, model_name, commit_hash = get_model_path(model_spec) 38 + if not model_path.exists(): 39 + logger.info(f"Model {model_spec} not found in cache") 40 + raise HTTPException( 41 + status_code=404, detail=f"Model {model_spec} not found in cache" 42 + ) 43 + except Exception as e: 44 + logger.info(f"Model {model_spec} not found in: {str(e)}") 45 + raise HTTPException( 46 + status_code=404, detail=f"Model {model_spec} not found: {str(e)}" 47 + ) 48 + 49 + # Check if it's an MLX model 50 + 51 + model_path_str = str(model_path) 52 + 53 + # Check if we need to load a different model 54 + if _current_model_path != model_path_str: 55 + # Proactively clean up any previously loaded runner to release memory 56 + if _model_cache: 57 + try: 58 + for _old_runner in list(_model_cache.values()): 59 + try: 
60 + _old_runner.cleanup() 61 + except Exception: 62 + pass 63 + finally: 64 + _model_cache.clear() 65 + 66 + # Load new model 67 + if verbose: 68 + print(f"Loading model: {model_name}") 69 + 70 + logger.info(f"Loading model: {model_name}") 71 + runner = MLXRunner(model_path_str, verbose=verbose) 72 + runner.load_model() 73 + 74 + _model_cache[model_path_str] = runner 75 + _current_model_path = model_path_str 76 + else: 77 + logger.info(f"Model {model_name} already in memory") 78 + 79 + return _model_cache[model_path_str] 80 + 81 + async def generate_chat_stream( 82 + messages: List[ChatMessage], request: ChatCompletionRequest 83 + ) -> AsyncGenerator[str, None]: 84 + """Generate streaming chat completion response.""" 85 + 86 + _messages = messages 87 + completion_id = f"chatcmpl-{uuid.uuid4()}" 88 + created = int(time.time()) 89 + runner = get_or_load_model(request.model) 90 + if request.chat_start: 91 + _messages.extend(request.messages) 92 + # Convert messages to dict format for runner 93 + message_dicts = format_chat_messages_for_runner(_messages) 94 + 95 + # Let the runner format with chat templates 96 + prompt = runner._format_conversation(message_dicts, use_chat_template=True) 97 + 98 + # Yield initial response 99 + initial_response = { 100 + "id": completion_id, 101 + "object": "chat.completion.chunk", 102 + "created": created, 103 + "model": request.model, 104 + "choices": [ 105 + { 106 + "index": 0, 107 + "delta": {"role": "assistant", "content": ""}, 108 + "finish_reason": None, 109 + } 110 + ], 111 + } 112 + 113 + yield f"data: {json.dumps(initial_response)}\n\n" 114 + 115 + # Stream tokens 116 + try: 117 + for token in runner.generate_streaming( 118 + prompt=prompt, 119 + max_tokens=runner.get_effective_max_tokens( 120 + request.max_tokens or _default_max_tokens, interactive=False 121 + ), 122 + temperature=request.temperature, 123 + top_p=request.top_p, 124 + repetition_penalty=request.repetition_penalty, 125 + use_chat_template=False, # Already 
applied in _format_conversation 126 + use_chat_stop_tokens=False, # Server mode shouldn't stop on chat markers 127 + ): 128 + chunk_response = { 129 + "id": completion_id, 130 + "object": "chat.completion.chunk", 131 + "created": created, 132 + "model": request.model, 133 + "choices": [ 134 + {"index": 0, "delta": {"content": token}, "finish_reason": None} 135 + ], 136 + } 137 + 138 + yield f"data: {json.dumps(chunk_response)}\n\n" 139 + 140 + # Check for stop sequences 141 + if request.stop: 142 + stop_sequences = ( 143 + request.stop if isinstance(request.stop, list) else [request.stop] 144 + ) 145 + if any(stop in token for stop in stop_sequences): 146 + break 147 + 148 + except Exception as e: 149 + error_response = { 150 + "id": completion_id, 151 + "object": "chat.completion.chunk", 152 + "created": created, 153 + "model": request.model, 154 + "choices": [{"index": 0, "delta": {}, "finish_reason": "error"}], 155 + "error": str(e), 156 + } 157 + yield f"data: {json.dumps(error_response)}\n\n" 158 + 159 + # Final response 160 + final_response = { 161 + "id": completion_id, 162 + "object": "chat.completion.chunk", 163 + "created": created, 164 + "model": request.model, 165 + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], 166 + } 167 + 168 + yield f"data: {json.dumps(final_response)}\n\n" 169 + yield "data: [DONE]\n\n" 170 + 171 + def format_chat_messages_for_runner( 172 + messages: List[ChatMessage], 173 + ) -> List[Dict[str, str]]: 174 + """Convert chat messages to format expected by MLXRunner. 175 + 176 + Returns messages in dict format for the runner to apply chat templates. 177 + """ 178 + return [{"role": msg.role, "content": msg.content} for msg in messages] 179 + 180 + 181 + def count_tokens(text: str) -> int: 182 + """Rough token count estimation.""" 183 + return int(len(text.split()) * 1.3) # Approximation, convert to int
+205 -109
server/cache_utils.py
··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 
22 1 import datetime 23 2 import json 24 3 import os ··· 56 35 else: 57 36 return f"models--{hf_name}" 58 37 38 + 59 39 def cache_dir_to_hf(cache_name: str) -> str: 60 40 if cache_name.startswith("models--"): 61 - remaining = cache_name[len("models--"):] 41 + remaining = cache_name[len("models--") :] 62 42 if "--" in remaining: 63 43 parts = remaining.split("--", 1) 64 44 return f"{parts[0]}/{parts[1]}" ··· 66 46 return remaining 67 47 return cache_name 68 48 49 + 69 50 def expand_model_name(model_name): 70 51 if "/" in model_name: 71 52 return model_name ··· 74 55 if mlx_cache_dir.exists(): 75 56 return mlx_candidate 76 57 common_mlx_patterns = [ 77 - "Llama-", "Qwen", "Mistral", "Phi-", "Mixtral", "phi-", "deepseek" 58 + "Llama-", 59 + "Qwen", 60 + "Mistral", 61 + "Phi-", 62 + "Mixtral", 63 + "phi-", 64 + "deepseek", 78 65 ] 79 66 for pattern in common_mlx_patterns: 80 67 if pattern in model_name: 81 68 return f"mlx-community/{model_name}" 82 69 return model_name 83 70 71 + 84 72 def find_matching_models(pattern): 85 73 """Find models that match a partial pattern. Returns a list of (model_dir, hf_name) tuples.""" 86 74 all_models = [d for d in MODEL_CACHE.iterdir() if d.name.startswith("models--")] 87 75 matches = [] 88 - 76 + 89 77 for model_dir in all_models: 90 78 hf_name = cache_dir_to_hf(model_dir.name) 91 79 # Check if the pattern appears in the model name (case insensitive) 92 80 if pattern.lower() in hf_name.lower(): 93 81 matches.append((model_dir, hf_name)) 94 - 82 + 95 83 return matches 96 84 85 + 97 86 def hash_exists_in_local_cache(model_name, commit_hash): 98 87 """Check if a specific commit hash exists in the local cache for a model. 99 - 88 + 100 89 Supports both full hashes and short hash prefixes (local resolution only). 
101 - 90 + 102 91 Args: 103 92 model_name: Full model name (e.g., 'mlx-community/Phi-3-mini-4k-instruct-4bit') 104 93 commit_hash: Commit hash to check for (short or full) 105 - 94 + 106 95 Returns: 107 96 Full hash if exists in local cache, None otherwise 108 97 """ 109 98 base_cache_dir = MODEL_CACHE / hf_to_cache_dir(model_name) 110 99 if not base_cache_dir.exists(): 111 100 return None 112 - 101 + 113 102 snapshots_dir = base_cache_dir / "snapshots" 114 103 if not snapshots_dir.exists(): 115 104 return None 116 - 105 + 117 106 # Check for exact match first (full hash) 118 107 hash_dir = snapshots_dir / commit_hash 119 108 if hash_dir.exists(): 120 109 return commit_hash 121 - 110 + 122 111 # Check for short hash match (local resolution) 123 112 if len(commit_hash) < 40: 124 113 for snapshot_dir in snapshots_dir.iterdir(): 125 114 if snapshot_dir.is_dir() and snapshot_dir.name.startswith(commit_hash): 126 115 return snapshot_dir.name # Return full hash 127 - 116 + 128 117 return None 118 + 129 119 130 120 def resolve_single_model(model_spec): 131 121 """ ··· 135 125 """ 136 126 # Parse the model spec (handles @commit_hash syntax) 137 127 model_name, commit_hash = parse_model_spec(model_spec) 138 - 128 + 139 129 # Try exact match first 140 130 base_cache_dir = MODEL_CACHE / hf_to_cache_dir(model_name) 141 131 if base_cache_dir.exists(): 142 132 return get_model_path(model_spec) 143 - 133 + 144 134 # Extract the base name (without @commit_hash) for fuzzy matching 145 - base_spec = model_spec.split('@')[0] if '@' in model_spec else model_spec 146 - 135 + base_spec = model_spec.split("@")[0] if "@" in model_spec else model_spec 136 + 147 137 # Try fuzzy matching 148 138 matches = find_matching_models(base_spec) 149 - 139 + 150 140 if not matches: 151 141 print(f"No models found matching '{base_spec}'!") 152 142 return None, None, None ··· 165 155 if resolved_hash: 166 156 resolved_spec = f"{hf_name}@{resolved_hash}" 167 157 return get_model_path(resolved_spec) 168 - 
158 + 169 159 # Hash not found in any candidate model 170 160 print(f"Hash '{commit_hash}' not found in any model matching '{base_spec}'") 171 161 print("Available models:") ··· 178 168 for _, hf_name in sorted(matches, key=lambda x: x[1]): 179 169 print(f" {hf_name}") 180 170 return None, None, None 171 + 181 172 182 173 def get_model_path(model_spec): 183 174 model_name, commit_hash = parse_model_spec(model_spec) ··· 198 189 return latest, model_name, latest.name 199 190 # Return base_cache_dir for corrupted models so rm_model can handle them 200 191 return base_cache_dir, model_name, commit_hash 192 + 201 193 202 194 def parse_model_spec(model_spec): 203 195 if "@" in model_spec: ··· 207 199 model_name = expand_model_name(model_spec) 208 200 return model_name, None 209 201 202 + 210 203 def get_model_size(model_path): 211 204 if not model_path.exists(): 212 205 return "?" ··· 220 213 return f"{total_size / 1_000_000:.1f} MB" 221 214 else: 222 215 return f"{total_size / 1_000:.1f} KB" 216 + 223 217 224 218 def get_model_modified(model_path): 225 219 if not model_path.exists(): ··· 237 231 minutes = diff.seconds // 60 238 232 return f"{minutes} minutes ago" 239 233 234 + 240 235 def detect_framework(model_path, hf_name): 241 236 """Detect model framework with lenient hints (Issue #31).""" 242 237 # 1) org hint ··· 246 241 # 2) README front matter: tags contains 'mlx' OR library_name == 'mlx' 247 242 try: 248 243 tags, pipeline, lib = read_readme_front_matter(Path(model_path)) 249 - if (lib and lib.lower() == "mlx") or (tags and any((t or '').lower() == "mlx" for t in tags)): 244 + if (lib and lib.lower() == "mlx") or ( 245 + tags and any((t or "").lower() == "mlx" for t in tags) 246 + ): 250 247 return "MLX" 251 248 except Exception: 252 249 pass ··· 261 258 has_config = any(snapshots_dir.glob("*/*.json")) 262 259 total_size = get_model_size(Path(model_path)) 263 260 try: 264 - size_mb = float(total_size.replace(" GB", "000").replace(" MB", "").replace(" KB", 
"0").replace(" ", "")) 261 + size_mb = float( 262 + total_size.replace(" GB", "000") 263 + .replace(" MB", "") 264 + .replace(" KB", "0") 265 + .replace(" ", "") 266 + ) 265 267 except Exception: 266 268 size_mb = 0 267 269 if has_gguf: ··· 286 288 try: 287 289 tags, pipeline, _ = read_readme_front_matter(Path(model_path)) 288 290 tset = {t.lower() for t in (tags or [])} 289 - if pipeline == "text-generation" or any(k in tset for k in {"chat", "instruct"}): 291 + if pipeline == "text-generation" or any( 292 + k in tset for k in {"chat", "instruct"} 293 + ): 290 294 return "chat" 291 - if pipeline == "sentence-similarity" or any(k in tset for k in {"embedding", "embeddings"}): 295 + if pipeline == "sentence-similarity" or any( 296 + k in tset for k in {"embedding", "embeddings"} 297 + ): 292 298 return "embedding" 293 299 except Exception: 294 300 pass ··· 314 320 except Exception: 315 321 return None 316 322 323 + 317 324 def get_model_hash(model_path): 318 325 snapshots_dir = model_path / "snapshots" 319 326 if not snapshots_dir.exists(): ··· 323 330 return "--------" 324 331 latest = max(snapshots, key=lambda x: x.stat().st_mtime) 325 332 return latest.name[:8] 333 + 326 334 327 335 def is_model_healthy(model_spec): 328 336 """Strict health check for 1.x (backport of #27 rules). 
··· 361 369 # 2) Fail fast on partial/tmp markers anywhere in the snapshot 362 370 for p in model_path.rglob("*"): 363 371 name = p.name.lower() 364 - if ".partial" in name or name.endswith(".partial") or name.endswith(".tmp") or "partial" in name: 372 + if ( 373 + ".partial" in name 374 + or name.endswith(".partial") 375 + or name.endswith(".tmp") 376 + or "partial" in name 377 + ): 365 378 return False 366 379 367 380 # Helper: detect Git LFS pointer file ··· 414 427 # 4) No index present — detect multi-shard pattern 415 428 # If pattern shards exist, require index (unhealthy without index by policy parity with 2.0) 416 429 import re 430 + 417 431 shard_re = re.compile(r"model-([0-9]{5})-of-([0-9]{5})\.(safetensors|bin)") 418 432 pattern_files = [] 419 433 for f in model_path.glob("*"): ··· 426 440 return False 427 441 428 442 # 5) Single-file weights fallback (includes GGUF) 429 - weight_files = list(model_path.rglob("*.safetensors")) + list(model_path.rglob("*.bin")) + list(model_path.rglob("*.gguf")) 443 + weight_files = ( 444 + list(model_path.rglob("*.safetensors")) 445 + + list(model_path.rglob("*.bin")) 446 + + list(model_path.rglob("*.gguf")) 447 + ) 430 448 # Exclude known pattern shards from consideration (handled above) 431 449 filtered_weights = [] 432 450 for f in weight_files: ··· 444 462 ok, _ = check_lfs_corruption(model_path) 445 463 return ok 446 464 465 + 447 466 def check_lfs_corruption(model_path): 448 467 """Recursively scan for Git LFS pointer files (suspiciously small files).""" 449 468 corrupted_files = [] 450 469 for file_path in model_path.rglob("*"): 451 470 try: 452 471 if file_path.is_file() and file_path.stat().st_size < 200: 453 - with open(file_path, 'rb') as f: 472 + with open(file_path, "rb") as f: 454 473 header = f.read(200) 455 - if b'version https://git-lfs.github.com/spec/v1' in header: 474 + if b"version https://git-lfs.github.com/spec/v1" in header: 456 475 corrupted_files.append(str(file_path.relative_to(model_path))) 
457 476 except Exception: 458 477 # Ignore unreadable files in corruption scan, keep conservative ··· 461 480 return False, f"LFS pointers instead of files: {', '.join(corrupted_files)}" 462 481 return True, "No LFS corruption detected" 463 482 483 + 464 484 def check_model_health(model_spec): 465 485 model_path, model_name, commit_hash = resolve_single_model(model_spec) 466 486 if not model_path: 467 487 # resolve_single_model already printed the appropriate error message 468 488 return False 469 - 489 + 470 490 print(f"Checking model: {model_name}") 471 491 if commit_hash: 472 492 print(f"Hash: {commit_hash}") 473 - 493 + 474 494 # Use the robust health check 475 495 if is_model_healthy(model_spec): 476 496 print("\n[OK] Model is healthy and usable!") ··· 478 498 else: 479 499 # Detailed diagnosis for WHY it's unhealthy 480 500 print("\n[ERROR] Model is corrupted. Detailed diagnosis:") 481 - 501 + 482 502 # Check config.json 483 503 config_path = model_path / "config.json" 484 504 if not config_path.exists(): ··· 493 513 print(" - config.json found and valid") 494 514 except (OSError, json.JSONDecodeError): 495 515 print(" - config.json exists but contains invalid JSON") 496 - 516 + 497 517 # Check weight files (including gguf support like is_model_healthy) 498 - weight_files = list(model_path.glob("*.safetensors")) + list(model_path.glob("*.bin")) + list(model_path.glob("*.gguf")) 518 + weight_files = ( 519 + list(model_path.glob("*.safetensors")) 520 + + list(model_path.glob("*.bin")) 521 + + list(model_path.glob("*.gguf")) 522 + ) 499 523 if not weight_files: 500 - weight_files = list(model_path.glob("**/*.safetensors")) + list(model_path.glob("**/*.bin")) + list(model_path.glob("**/*.gguf")) 501 - 524 + weight_files = ( 525 + list(model_path.glob("**/*.safetensors")) 526 + + list(model_path.glob("**/*.bin")) 527 + + list(model_path.glob("**/*.gguf")) 528 + ) 529 + 502 530 if weight_files: 503 531 total_size = sum(f.stat().st_size for f in weight_files) 504 
532 size_mb = total_size / (1024 * 1024) 505 - print(f" - Model weights found ({len(weight_files)} files, {size_mb:.1f}MB)") 533 + print( 534 + f" - Model weights found ({len(weight_files)} files, {size_mb:.1f}MB)" 535 + ) 506 536 elif (model_path / "model.safetensors.index.json").exists(): 507 537 # Check multi-file model 508 538 try: 509 539 with open(model_path / "model.safetensors.index.json") as f: 510 540 index = json.load(f) 511 - if 'weight_map' in index: 512 - referenced_files = set(index['weight_map'].values()) 513 - existing_files = [f for f in referenced_files if (model_path / f).exists()] 541 + if "weight_map" in index: 542 + referenced_files = set(index["weight_map"].values()) 543 + existing_files = [ 544 + f for f in referenced_files if (model_path / f).exists() 545 + ] 514 546 if existing_files: 515 - total_size = sum((model_path / f).stat().st_size for f in existing_files) 547 + total_size = sum( 548 + (model_path / f).stat().st_size for f in existing_files 549 + ) 516 550 size_mb = total_size / (1024 * 1024) 517 - print(f" - Multi-file weights ({len(existing_files)}/{len(referenced_files)} files, {size_mb:.1f}MB)") 551 + print( 552 + f" - Multi-file weights ({len(existing_files)}/{len(referenced_files)} files, {size_mb:.1f}MB)" 553 + ) 518 554 if len(existing_files) < len(referenced_files): 519 555 print(" - Incomplete multi-file model") 520 556 else: 521 - print(" - Multi-file model index found but no weight files exist") 557 + print( 558 + " - Multi-file model index found but no weight files exist" 559 + ) 522 560 else: 523 561 print(" - Multi-file model index is invalid") 524 562 except Exception as e: 525 563 print(f" - Multi-file model index error: {e}") 526 564 else: 527 565 print(" - No model weights found (.safetensors, .bin, .gguf)") 528 - 566 + 529 567 # Check LFS corruption 530 568 lfs_ok, lfs_msg = check_lfs_corruption(model_path) 531 569 if not lfs_ok: 532 570 print(f" - {lfs_msg}") 533 571 else: 534 572 print(f" - {lfs_msg}") 535 - 
573 + 536 574 # Show framework 537 575 framework = detect_framework(model_path.parent.parent, model_name) 538 576 print(f" - Framework: {framework}") 539 - 577 + 540 578 # Offer deletion for corrupted models 541 579 confirm = input("\nModel appears corrupted. Delete? [y/N] ") 542 580 if confirm.lower() == "y": 543 581 import errno 544 582 import shutil 583 + 545 584 try: 546 585 if commit_hash: 547 586 # Delete specific hash/snapshot ··· 559 598 print(f"Model {model_name} deleted.") 560 599 except PermissionError as e: 561 600 print(f"[ERROR] Permission denied: Cannot delete {e.filename}") 562 - print(" Try running with appropriate permissions or manually delete the directory.") 601 + print( 602 + " Try running with appropriate permissions or manually delete the directory." 603 + ) 563 604 except OSError as e: 564 605 if e.errno == errno.ENOTEMPTY: 565 606 print(f"[ERROR] Directory not empty: {e.filename}") ··· 569 610 else: 570 611 print(f"[ERROR] OS Error while deleting: {e}") 571 612 except Exception as e: 572 - print(f"[ERROR] Unexpected error while deleting: {type(e).__name__}: {e}") 573 - 613 + print( 614 + f"[ERROR] Unexpected error while deleting: {type(e).__name__}: {e}" 615 + ) 616 + 574 617 return False 618 + 575 619 576 620 def check_all_models_health(): 577 621 models = [d for d in MODEL_CACHE.iterdir() if d.name.startswith("models--")] ··· 605 649 print(" python mlx_knife.cli health <model-name> # Show details") 606 650 return len(problematic_models) == 0 607 651 608 - def list_models(show_all=False, framework_filter=None, show_health=False, single_model=None, verbose=False): 652 + 653 + def list_models( 654 + show_all=False, 655 + framework_filter=None, 656 + show_health=False, 657 + single_model=None, 658 + verbose=False, 659 + ): 609 660 if single_model: 610 661 # Try exact match first 611 662 expanded_model = expand_model_name(single_model) ··· 616 667 else: 617 668 # If exact match fails, do partial name matching 618 669 if not 
MODEL_CACHE.exists(): 619 - print(f"No models found matching '{single_model}' - cache directory doesn't exist yet.") 670 + print( 671 + f"No models found matching '{single_model}' - cache directory doesn't exist yet." 672 + ) 620 673 print("Use 'mlxk pull <model-name>' to download models first.") 621 674 return 622 - all_models = [d for d in MODEL_CACHE.iterdir() if d.name.startswith("models--")] 675 + all_models = [ 676 + d for d in MODEL_CACHE.iterdir() if d.name.startswith("models--") 677 + ] 623 678 matching_models = [] 624 - 679 + 625 680 for model_dir in all_models: 626 681 hf_name = cache_dir_to_hf(model_dir.name) 627 682 # Check if the pattern appears in the model name (case insensitive) 628 683 if single_model.lower() in hf_name.lower(): 629 684 matching_models.append(model_dir) 630 - 685 + 631 686 if not matching_models: 632 687 print(f"No models found matching '{single_model}'!") 633 688 return 634 - 689 + 635 690 models = matching_models 636 691 else: 637 692 if not MODEL_CACHE.exists(): ··· 644 699 return 645 700 if show_health: 646 701 if show_all: 647 - print(f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'FRAMEWORK':<10} {'TYPE':<10} {'HEALTH':<8}") 702 + print( 703 + f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'FRAMEWORK':<10} {'TYPE':<10} {'HEALTH':<8}" 704 + ) 648 705 else: 649 - print(f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'HEALTH':<8}") 706 + print( 707 + f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'HEALTH':<8}" 708 + ) 650 709 else: 651 710 if show_all: 652 - print(f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'FRAMEWORK':<10} {'TYPE':<10}") 711 + print( 712 + f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'FRAMEWORK':<10} {'TYPE':<10}" 713 + ) 653 714 else: 654 715 print(f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15}") 655 716 for m in sorted(models, key=lambda x: x.stat().st_mtime, reverse=True): ··· 671 732 display_name = hf_name 672 733 if 
hf_name.startswith("mlx-community/") and not verbose: 673 734 # For MLX models, hide prefix unless verbose is set 674 - display_name = hf_name[len("mlx-community/"):] 735 + display_name = hf_name[len("mlx-community/") :] 675 736 health_status = "" 676 737 if show_health: 677 738 health_status = "[OK]" if is_model_healthy(hf_name) else "[ERR]" 678 739 if show_all: 679 - print(f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {framework:<10} {model_type:<10} {health_status:<8}") 740 + print( 741 + f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {framework:<10} {model_type:<10} {health_status:<8}" 742 + ) 680 743 else: 681 - print(f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {health_status:<8}") 744 + print( 745 + f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {health_status:<8}" 746 + ) 682 747 else: 683 748 if show_all: 684 - print(f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {framework:<10} {model_type:<10}") 749 + print( 750 + f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {framework:<10} {model_type:<10}" 751 + ) 685 752 else: 686 753 print(f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15}") 687 754 688 - def run_model(model_spec, prompt=None, interactive=False, temperature=0.7, 689 - max_tokens=500, top_p=0.9, repetition_penalty=1.1, stream=True, 690 - use_chat_template=True, hide_reasoning=False, verbose=False): 755 + 756 + def run_model( 757 + model_spec, 758 + prompt=None, 759 + interactive=False, 760 + temperature=0.7, 761 + max_tokens=500, 762 + top_p=0.9, 763 + repetition_penalty=1.1, 764 + stream=True, 765 + use_chat_template=True, 766 + hide_reasoning=False, 767 + verbose=False, 768 + ): 691 769 """Run an MLX model with enhanced features. 
692 - 770 + 693 771 Args: 694 772 model_spec: Model specification (name[@hash]) 695 773 prompt: Input prompt (if None and not interactive, enters interactive mode) ··· 730 808 ) 731 809 except ImportError: 732 810 # Fallback to subprocess if mlx_runner is not available 733 - print("[WARNING] Enhanced runner not available, falling back to subprocess mode") 811 + print( 812 + "[WARNING] Enhanced runner not available, falling back to subprocess mode" 813 + ) 734 814 print(f"Running model: {model_name}") 735 815 if commit_hash: 736 816 print(f"Hash: {commit_hash}") ··· 741 821 prompt = prompt or "Hello" 742 822 743 823 print(f"Prompt: {prompt}\n") 744 - os.system(f'python -m mlx_lm generate --model "{model_path}" --prompt "{prompt}"') 824 + os.system( 825 + f'python -m mlx_lm generate --model "{model_path}" --prompt "{prompt}"' 826 + ) 827 + 745 828 746 829 def show_model(model_spec, show_files=False, show_config=False): 747 830 """Show detailed information about a specific model.""" ··· 774 857 model_type = detect_model_type(model_path.parent.parent, model_name) 775 858 print(f"Framework: {framework}") 776 859 print(f"Type: {model_type}") 777 - 860 + 778 861 # Quantization info (if available) 779 862 quant_info = get_quantization_info(model_path) 780 863 if quant_info: ··· 787 870 main_config.append(f"{quant_info['bits']}-bit") 788 871 if "group_size" in quant_info: 789 872 main_config.append(f"group_size: {quant_info['group_size']}") 790 - 873 + 791 874 if main_config: 792 875 print(f"Quantization: {', '.join(main_config)}") 793 876 if "mode" in quant_info: 794 - print(f" Advanced mode '{quant_info['mode']}' (requires MLX ≥0.29.0, MLX-LM ≥0.27.0)") 877 + print( 878 + f" Advanced mode '{quant_info['mode']}' (requires MLX ≥0.29.0, MLX-LM ≥0.27.0)" 879 + ) 795 880 else: 796 881 print(f"Quantization: {quant_info}") 797 882 ··· 807 892 config_data = json.load(f) 808 893 809 894 # 1. 
Check for explicit quantization field (MLX style) 810 - if "quantization" in config_data and isinstance(config_data["quantization"], dict): 895 + if "quantization" in config_data and isinstance( 896 + config_data["quantization"], dict 897 + ): 811 898 quant = config_data["quantization"] 812 899 if "bits" in quant: 813 900 quantization_info = f"{quant['bits']}-bit" ··· 878 965 quantization_info = "Multiple GGUF variants available" 879 966 precision_info = "gguf (see variants below)" 880 967 elif len(gguf_variants) == 1: 881 - quantization_info = gguf_variants[0].split(' (')[0] 968 + quantization_info = gguf_variants[0].split(" (")[0] 882 969 precision_info = "gguf" 883 970 else: 884 971 quantization_info = "GGUF format (quantization unknown)" ··· 915 1002 if not (model_path / "config.json").exists(): 916 1003 issues.append("config.json missing") 917 1004 918 - weight_files = list(model_path.glob("*.safetensors")) + list(model_path.glob("*.bin")) + list(model_path.glob("*.gguf")) 1005 + weight_files = ( 1006 + list(model_path.glob("*.safetensors")) 1007 + + list(model_path.glob("*.bin")) 1008 + + list(model_path.glob("*.gguf")) 1009 + ) 919 1010 if not weight_files: 920 - weight_files = list(model_path.glob("**/*.safetensors")) + list(model_path.glob("**/*.bin")) + list(model_path.glob("**/*.gguf")) 1011 + weight_files = ( 1012 + list(model_path.glob("**/*.safetensors")) 1013 + + list(model_path.glob("**/*.bin")) 1014 + + list(model_path.glob("**/*.gguf")) 1015 + ) 921 1016 if not weight_files: 922 1017 index_file = model_path / "model.safetensors.index.json" 923 1018 if not index_file.exists(): ··· 974 1069 975 1070 return True 976 1071 1072 + 977 1073 def rm_model(model_spec, force=False): 978 1074 original_spec = model_spec 979 - 1075 + 980 1076 # First try to resolve using fuzzy matching 981 1077 resolved_path, resolved_name, resolved_hash = resolve_single_model(model_spec) 982 - 1078 + 983 1079 if not resolved_path: 984 1080 # resolve_single_model already 
printed the error message for most cases 985 1081 # But ensure we always provide feedback to the user 986 1082 print(f"Model '{original_spec}' not found or corrupted.") 987 1083 return 988 - 1084 + 989 1085 # Use the resolved model name for deletion 990 1086 model_name = resolved_name 991 1087 commit_hash = resolved_hash 992 - 993 - 1088 + 994 1089 # Confirm on auto-expansion (if the resolved name is different from input) 995 1090 base_spec = original_spec.split("@")[0] if "@" in original_spec else original_spec 996 1091 if base_spec != model_name and "/" not in base_spec: ··· 998 1093 if confirm.lower() == "n": 999 1094 print("Delete aborted.") 1000 1095 return 1001 - 1096 + 1002 1097 base_cache_dir = MODEL_CACHE / hf_to_cache_dir(model_name) 1003 1098 # This should exist since resolve_single_model succeeded, but double-check 1004 1099 if not base_cache_dir.exists(): ··· 1021 1116 else: 1022 1117 confirm = input(f"Delete hash {commit_hash} of model {model_name}? [y/N] ") 1023 1118 confirm_delete = confirm.lower() == "y" 1024 - 1119 + 1025 1120 if confirm_delete: 1026 1121 # Issue #23 Fix: Delete entire model directory, not just the snapshot 1027 1122 # This prevents the double-execution problem where refs/ remain intact 1028 1123 shutil.rmtree(base_cache_dir) 1029 1124 print(f"{model_name}@{commit_hash} deleted") 1030 - 1125 + 1031 1126 # Clean up associated lock files 1032 1127 try: 1033 1128 _cleanup_model_locks(model_name, force) ··· 1040 1135 if force: 1041 1136 confirm_delete = True 1042 1137 else: 1043 - confirm = input(f"Delete entire model {model_name} ({base_cache_dir})? [y/N] ") 1138 + confirm = input( 1139 + f"Delete entire model {model_name} ({base_cache_dir})? 
[y/N] " 1140 + ) 1044 1141 confirm_delete = confirm.lower() == "y" 1045 - 1142 + 1046 1143 if confirm_delete: 1047 1144 shutil.rmtree(base_cache_dir) 1048 1145 print(f"Model {model_name} completely deleted.") 1049 - 1146 + 1050 1147 # Clean up associated lock files 1051 1148 try: 1052 1149 _cleanup_model_locks(model_name, force) ··· 1058 1155 1059 1156 def _cleanup_model_locks(model_name, force=False): 1060 1157 """Clean up HuggingFace lock files for a deleted model. 1061 - 1158 + 1062 1159 Args: 1063 1160 model_name: The model name (e.g. 'microsoft/DialoGPT-small') 1064 1161 force: If True, delete without asking. If False, prompt user. 1065 1162 """ 1066 1163 locks_dir = MODEL_CACHE / ".locks" / hf_to_cache_dir(model_name) 1067 - 1164 + 1068 1165 if not locks_dir.exists(): 1069 1166 return # No locks to clean up 1070 - 1167 + 1071 1168 # Count lock files 1072 1169 try: 1073 1170 lock_files = list(locks_dir.iterdir()) 1074 1171 if not lock_files: 1075 1172 return # Empty directory 1076 - 1173 + 1077 1174 if force: 1078 1175 # Delete without asking 1079 1176 shutil.rmtree(locks_dir) ··· 1086 1183 print(f"Cache files cleaned up ({len(lock_files)} files).") 1087 1184 else: 1088 1185 print("Cache files left intact.") 1089 - 1186 + 1090 1187 except Exception as e: 1091 1188 print(f"Warning: Could not clean up cache files: {e}") 1092 -
+31 -11
server/main.py
··· 1 1 import uvicorn 2 + 3 + # from backend import linux 2 4 from .api import app 3 - from .config import PORT 5 + from .config import PORT 4 6 import logging 5 7 import sys 6 - from fastapi import Request 8 + from fastapi import Request 9 + from . import runtime 7 10 8 - # --- logging setup --- 9 11 logging.basicConfig( 10 12 level=logging.INFO, 11 13 format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", ··· 14 16 logger = logging.getLogger("app") 15 17 16 18 17 - # --- middleware for request logging --- 18 19 @app.middleware("http") 19 20 async def log_requests(request: Request, call_next): 20 21 try: ··· 22 23 except Exception: 23 24 body = None 24 25 25 - logger.info({ 26 - "method": request.method, 27 - "url": str(request.url), 28 - "client": request.client.host, 29 - "body": body, 30 - }) 26 + logger.info( 27 + { 28 + "method": request.method, 29 + "url": str(request.url), 30 + "client": request.client.host, 31 + "body": body, 32 + } 33 + ) 31 34 32 35 response = await call_next(request) 33 36 logger.info(f"<-- {request.method} {request.url.path} {response.status_code}") 34 37 return response 35 38 39 + def get_backend(): 40 + """ 41 + Dynamically choose which backend should be used depending on the OS 42 + """ 43 + if sys.platform == "darwin": 44 + from .backend import mlx 45 + logger.info("Using MLX backend (MacOs)") 46 + return mlx 47 + elif sys.platform.startswith("linux"): 48 + from .backend import linux 49 + logger.info(f"Using linux backend {sys.platform}") 50 + return linux 51 + else: 52 + raise RuntimeError(f"Unsupported OS: {sys.platform}") 53 + 54 + runtime.backend = get_backend() 55 + 36 56 def run(): 37 57 uvicorn.run(app, host="127.0.0.1", port=PORT) 38 58 59 + 39 60 if __name__ == "__main__": 40 61 run() 41 -
+254 -176
server/mlx_runner.py server/backend/mlx_runner.py
··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 22 - 23 1 """ 24 2 Enhanced MLX model runner with direct API integration. 25 3 Provides ollama-like run experience with streaming and interactive chat. 
26 4 """ 27 5 6 + import sys 28 7 import json 29 8 import os 30 9 import time ··· 32 11 from pathlib import Path 33 12 from typing import Dict, Optional 34 13 35 - import mlx.core as mx 14 + if sys.platform == "darwin": 15 + import mlx.core as mx 16 + else: 17 + mx = None 36 18 from mlx_lm import load 37 19 from mlx_lm.generate import generate_step 38 20 from mlx_lm.sample_utils import make_repetition_penalty, make_sampler 39 21 40 - from .reasoning_utils import ReasoningExtractor, StreamingReasoningParser 22 + from ..reasoning_utils import ReasoningExtractor, StreamingReasoningParser 41 23 42 24 43 25 def get_model_context_length(model_path: str) -> int: 44 26 """Extract max_position_embeddings from model config. 45 - 27 + 46 28 Args: 47 29 model_path: Path to the MLX model directory 48 - 30 + 49 31 Returns: 50 32 Maximum context length for the model (defaults to 4096 if not found) 51 33 """ 52 34 config_path = os.path.join(model_path, "config.json") 53 - 35 + 54 36 try: 55 37 with open(config_path) as f: 56 38 config = json.load(f) 57 - 39 + 58 40 # Try various common config keys for context length 59 41 context_keys = [ 60 42 "max_position_embeddings", 61 43 "n_positions", 62 44 "context_length", 63 45 "max_sequence_length", 64 - "seq_len" 46 + "seq_len", 65 47 ] 66 - 48 + 67 49 for key in context_keys: 68 50 if key in config: 69 51 return config[key] 70 - 52 + 71 53 # If no context length found, return reasonable default 72 54 return 4096 73 - 55 + 74 56 except (FileNotFoundError, json.JSONDecodeError, KeyError): 75 57 # Return default if config can't be read 76 58 return 4096 ··· 79 61 class MLXRunner: 80 62 """Direct MLX model runner with streaming and interactive capabilities.""" 81 63 82 - def __init__(self, model_path: str, adapter_path: Optional[str] = None, verbose: bool = False): 64 + def __init__( 65 + self, model_path: str, adapter_path: Optional[str] = None, verbose: bool = False 66 + ): 83 67 """Initialize the runner with a model. 
84 - 68 + 85 69 Args: 86 70 model_path: Path to the MLX model directory 87 71 adapter_path: Optional path to LoRA adapter ··· 107 91 def __enter__(self): 108 92 """Context manager entry - loads the model.""" 109 93 if self._context_entered: 110 - raise RuntimeError("MLXRunner context manager cannot be entered multiple times") 111 - 94 + raise RuntimeError( 95 + "MLXRunner context manager cannot be entered multiple times" 96 + ) 97 + 112 98 self._context_entered = True 113 99 try: 114 100 self.load_model() ··· 146 132 try: 147 133 # Load model and tokenizer 148 134 self.model, self.tokenizer = load( 149 - str(self.model_path), 150 - adapter_path=self.adapter_path 135 + str(self.model_path), adapter_path=self.adapter_path 151 136 ) 152 137 153 138 load_time = time.time() - start_time ··· 156 141 157 142 if self.verbose: 158 143 print(f"Model loaded in {load_time:.1f}s") 159 - print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total") 144 + print( 145 + f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total" 146 + ) 160 147 161 148 # Extract stop tokens from tokenizer 162 149 self._extract_stop_tokens() 163 - 150 + 164 151 # Extract context length from model config 165 152 self._context_length = get_model_context_length(str(self.model_path)) 166 - 153 + 167 154 if self.verbose: 168 155 print(f"Model context length: {self._context_length} tokens") 169 - 156 + 170 157 self._model_loaded = True 171 - 158 + 172 159 except Exception as e: 173 160 # Ensure partial state is cleaned up on failure 174 161 self.model = None ··· 177 164 self._model_loaded = False 178 165 # Clear any memory that might have been allocated 179 166 mx.clear_cache() 180 - raise RuntimeError(f"Failed to load model from {self.model_path}: {e}") from e 167 + raise RuntimeError( 168 + f"Failed to load model from {self.model_path}: {e}" 169 + ) from e 181 170 182 171 def _extract_stop_tokens(self): 183 172 """Extract stop tokens from the tokenizer dynamically. 
184 - 173 + 185 174 This method identifies ALL tokens that should stop generation: 186 175 1. Official EOS token from tokenizer config 187 176 2. Message-end tokens from training (e.g., <|end|> for MXFP4) 188 177 3. Common stop tokens across models 189 178 """ 190 179 self._stop_tokens = set() 191 - self._message_end_tokens = set() # Tokens that end messages but not conversations 180 + self._message_end_tokens = ( 181 + set() 182 + ) # Tokens that end messages but not conversations 192 183 193 184 # Primary source: eos_token 194 - eos_token = getattr(self.tokenizer, 'eos_token', None) 185 + eos_token = getattr(self.tokenizer, "eos_token", None) 195 186 if eos_token: 196 187 self._stop_tokens.add(eos_token) 197 188 198 189 # Also check pad_token if it's different from eos_token 199 - pad_token = getattr(self.tokenizer, 'pad_token', None) 190 + pad_token = getattr(self.tokenizer, "pad_token", None) 200 191 if pad_token and pad_token != eos_token: 201 192 self._stop_tokens.add(pad_token) 202 193 203 194 # Check additional_special_tokens 204 - if hasattr(self.tokenizer, 'additional_special_tokens'): 195 + if hasattr(self.tokenizer, "additional_special_tokens"): 205 196 for token in self.tokenizer.additional_special_tokens: 206 197 if token and isinstance(token, str): 207 198 # Only add tokens that look like stop/end tokens 208 - if any(keyword in token.lower() for keyword in ['end', 'stop', 'eot']): 199 + if any( 200 + keyword in token.lower() for keyword in ["end", "stop", "eot"] 201 + ): 209 202 self._stop_tokens.add(token) 210 - 203 + 211 204 # MLX-LM 0.27.0+: Extract tokens from added_tokens_decoder (comprehensive source) 212 - if hasattr(self.tokenizer, 'added_tokens_decoder'): 205 + if hasattr(self.tokenizer, "added_tokens_decoder"): 213 206 for _token_id, token_info in self.tokenizer.added_tokens_decoder.items(): 214 - if isinstance(token_info, dict) and 'content' in token_info: 215 - token_content = token_info['content'] 207 + if isinstance(token_info, dict) 
and "content" in token_info: 208 + token_content = token_info["content"] 216 209 if token_content and isinstance(token_content, str): 217 210 token_lower = token_content.lower() 218 - 211 + 219 212 # NOTE: <|end|> is NOT a stop token for MXFP4 models! 220 213 # It's a separator between reasoning and final answer 221 - if token_content == '<|end|>': 214 + if token_content == "<|end|>": 222 215 self._message_end_tokens.add(token_content) 223 216 # Do NOT add as stop token - let model continue to final answer 224 - 217 + 225 218 # Look for tokens that could be end/stop tokens 226 219 # Expanded patterns for MLX-LM 0.27.0 token varieties 227 220 # EXCLUDE <|end|> for MXFP4 models as it's a reasoning separator 228 - end_patterns = ['stop', 'eot', 'return', 'finish', 'done', 'im_end'] 221 + end_patterns = [ 222 + "stop", 223 + "eot", 224 + "return", 225 + "finish", 226 + "done", 227 + "im_end", 228 + ] 229 229 if any(pattern in token_lower for pattern in end_patterns): 230 230 # Decide if it's a message-end or conversation-end token 231 - if 'im_end' in token_lower: 231 + if "im_end" in token_lower: 232 232 self._message_end_tokens.add(token_content) 233 233 self._stop_tokens.add(token_content) 234 234 # Special handling for 'end' pattern - more selective 235 - elif 'end' in token_lower and token_content != '<|end|>': 235 + elif "end" in token_lower and token_content != "<|end|>": 236 236 # Only add non-<|end|> tokens with 'end' in them 237 237 self._stop_tokens.add(token_content) 238 - 238 + 239 239 # Special case: control tokens in |..| format 240 - elif token_content.startswith('<|') and token_content.endswith('|>'): 240 + elif token_content.startswith("<|") and token_content.endswith( 241 + "|>" 242 + ): 241 243 # Be inclusive with control tokens that might stop generation 242 - if any(pattern in token_lower for pattern in ['end', 'return', 'stop', 'finish']): 244 + if any( 245 + pattern in token_lower 246 + for pattern in ["end", "return", "stop", "finish"] 247 + ): 
243 248 self._stop_tokens.add(token_content) 244 249 245 250 # Model-specific handling based on known patterns 246 251 # Use reasoning_utils for reasoning model detection and patterns 247 - from .reasoning_utils import ReasoningExtractor 248 - 249 - if hasattr(self.tokenizer, 'name_or_path'): 250 - name_or_path = str(getattr(self.tokenizer, 'name_or_path', '')).lower() 252 + from ..reasoning_utils import ReasoningExtractor 253 + 254 + if hasattr(self.tokenizer, "name_or_path"): 255 + name_or_path = str(getattr(self.tokenizer, "name_or_path", "")).lower() 251 256 model_type = ReasoningExtractor.detect_model_type(name_or_path) 252 - 257 + 253 258 if model_type: 254 259 # This is a reasoning model 255 260 self._is_reasoning_model = True 256 - 261 + 257 262 # Get patterns from reasoning_utils 258 263 if model_type in ReasoningExtractor.PATTERNS: 259 - markers = ReasoningExtractor.PATTERNS[model_type]['markers'] 260 - self._reasoning_start = markers.get('reasoning_start') 261 - self._reasoning_end = markers.get('reasoning_end') 262 - self._final_start = markers.get('final_marker') 263 - 264 + markers = ReasoningExtractor.PATTERNS[model_type]["markers"] 265 + self._reasoning_start = markers.get("reasoning_start") 266 + self._reasoning_end = markers.get("reasoning_end") 267 + self._final_start = markers.get("final_marker") 268 + 264 269 # For reasoning models, remove reasoning_end from stop tokens 265 270 if self._reasoning_end: 266 271 self._stop_tokens.discard(self._reasoning_end) 267 - 272 + 268 273 # Add proper stop token for this model type 269 - if model_type == 'gpt-oss': 270 - if '<|return|>' not in self._stop_tokens: 271 - self._stop_tokens.add('<|return|>') 274 + if model_type == "gpt-oss": 275 + if "<|return|>" not in self._stop_tokens: 276 + self._stop_tokens.add("<|return|>") 272 277 else: 273 278 self._is_reasoning_model = False 274 279 else: 275 280 self._is_reasoning_model = False 276 281 277 282 # Add common stop tokens that might not be in special tokens 
278 - common_stop_tokens = {'</s>', '<|endoftext|>', '<|im_end|>', '<|eot_id|>'} 279 - 283 + common_stop_tokens = {"</s>", "<|endoftext|>", "<|im_end|>", "<|eot_id|>"} 284 + 280 285 # Add chat-specific stop tokens to prevent model self-conversations 281 286 # Based on our _format_conversation() format: "Human:" and "Assistant:" 282 287 # Also include "You:" as models might use UI-visible format 283 288 # Include single-letter variations (H:, A:, Y:) that some models use 284 289 chat_stop_tokens = { 285 - '\nHuman:', '\nAssistant:', '\nYou:', 286 - '\n\nHuman:', '\n\nAssistant:', '\n\nYou:', 287 - '\nH:', '\nA:', '\nY:', # Single-letter variations 288 - '\n\nH:', '\n\nA:', '\n\nY:' 290 + "\nHuman:", 291 + "\nAssistant:", 292 + "\nYou:", 293 + "\n\nHuman:", 294 + "\n\nAssistant:", 295 + "\n\nYou:", 296 + "\nH:", 297 + "\nA:", 298 + "\nY:", # Single-letter variations 299 + "\n\nH:", 300 + "\n\nA:", 301 + "\n\nY:", 289 302 } 290 303 291 304 # Add common stop tokens only if they decode to themselves (i.e., they're single tokens) ··· 299 312 self._stop_tokens.add(token) 300 313 except: 301 314 pass 302 - 315 + 303 316 # Store chat stop tokens separately - only used in interactive chat mode 304 317 # This prevents stopping mid-story when user asks for dialogues 305 318 self._chat_stop_tokens = list(chat_stop_tokens) ··· 320 333 321 334 def cleanup(self): 322 335 """Clean up model resources and clear GPU memory. 323 - 336 + 324 337 This method is safe to call multiple times and handles partial state cleanup. 
325 338 """ 326 339 if self.verbose and self._model_loaded: ··· 342 355 343 356 # Force garbage collection and clear MLX cache 344 357 import gc 358 + 345 359 gc.collect() 346 360 try: 347 361 mx.clear_cache() ··· 350 364 351 365 if self.verbose: 352 366 memory_after = mx.get_active_memory() / 1024**3 353 - if 'memory_before' in locals(): 367 + if "memory_before" in locals(): 354 368 memory_freed = memory_before - memory_after 355 - print(f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)") 369 + print( 370 + f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)" 371 + ) 356 372 else: 357 373 print(f"Cleanup complete (memory after: {memory_after:.1f}GB)") 358 374 359 - def get_effective_max_tokens(self, requested_tokens: Optional[int], interactive: bool = False) -> int: 375 + def get_effective_max_tokens( 376 + self, requested_tokens: Optional[int], interactive: bool = False 377 + ) -> int: 360 378 """Get effective max tokens based on model context and usage mode. 
361 - 379 + 362 380 Args: 363 381 requested_tokens: The requested max tokens (None if user didn't specify --max-tokens) 364 382 interactive: True if this is interactive mode (gets full context length) 365 - 383 + 366 384 Returns: 367 385 Effective max tokens to use 368 386 """ ··· 371 389 fallback = 4096 if interactive else 2048 372 390 if self.verbose: 373 391 if requested_tokens is None: 374 - print(f"[WARNING] Model context length unknown, using fallback: {fallback} tokens") 392 + print( 393 + f"[WARNING] Model context length unknown, using fallback: {fallback} tokens" 394 + ) 375 395 else: 376 - print(f"[WARNING] Model context length unknown, using user specified: {requested_tokens} tokens") 396 + print( 397 + f"[WARNING] Model context length unknown, using user specified: {requested_tokens} tokens" 398 + ) 377 399 return requested_tokens if requested_tokens is not None else fallback 378 - 400 + 379 401 if interactive: 380 402 if requested_tokens is None: 381 403 # User didn't specify --max-tokens: use full model context ··· 402 424 hide_reasoning: bool = False, 403 425 ) -> Iterator[str]: 404 426 """Generate text with streaming output. 405 - 427 + 406 428 Args: 407 429 prompt: Input prompt 408 430 max_tokens: Maximum tokens to generate ··· 413 435 use_chat_template: Apply tokenizer's chat template if available 414 436 use_chat_stop_tokens: Include chat turn markers as stop tokens (for interactive mode) 415 437 interactive: True if this is interactive mode (affects token limits) 416 - 438 + 417 439 Yields: 418 440 Generated tokens as they are produced 419 441 """ 420 442 if not self.model or not self.tokenizer: 421 443 raise RuntimeError("Model not loaded. 
Call load_model() first.") 422 - 444 + 423 445 # Initialize reasoning parser if this is a reasoning model 424 446 reasoning_parser = None 425 447 if self._is_reasoning_model: 426 448 model_type = ReasoningExtractor.detect_model_type( 427 - getattr(self.tokenizer, 'name_or_path', '') or '' 449 + getattr(self.tokenizer, "name_or_path", "") or "" 450 + ) 451 + reasoning_parser = StreamingReasoningParser( 452 + model_type, hide_reasoning=hide_reasoning 428 453 ) 429 - reasoning_parser = StreamingReasoningParser(model_type, hide_reasoning=hide_reasoning) 430 454 431 455 # Apply context-aware token limits 432 456 effective_max_tokens = self.get_effective_max_tokens(max_tokens, interactive) 433 457 434 458 # Apply chat template if available and requested 435 - if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template: 459 + if ( 460 + use_chat_template 461 + and hasattr(self.tokenizer, "chat_template") 462 + and self.tokenizer.chat_template 463 + ): 436 464 messages = [{"role": "user", "content": prompt}] 437 465 formatted_prompt = self.tokenizer.apply_chat_template( 438 - messages, 439 - tokenize=False, 440 - add_generation_prompt=True 466 + messages, tokenize=False, add_generation_prompt=True 441 467 ) 442 468 else: 443 469 formatted_prompt = prompt ··· 479 505 480 506 for token, _ in generator: 481 507 # Token might be an array or an int 482 - token_id = token.item() if hasattr(token, 'item') else token 508 + token_id = token.item() if hasattr(token, "item") else token 483 509 generated_tokens.append(token_id) 484 510 485 511 # Use a sliding window approach for efficiency ··· 493 519 if start_idx == 0: 494 520 # We're still within the context window 495 521 if window_text.startswith(previous_decoded): 496 - new_text = window_text[len(previous_decoded):] 522 + new_text = window_text[len(previous_decoded) :] 497 523 else: 498 524 new_text = self.tokenizer.decode([token_id]) 499 525 previous_decoded = window_text ··· 504 530 if 
len(window_tokens) > 1: 505 531 prefix = self.tokenizer.decode(window_tokens[:-1]) 506 532 if new_text.startswith(prefix): 507 - new_text = new_text[len(prefix):] 533 + new_text = new_text[len(prefix) :] 508 534 else: 509 535 new_text = self.tokenizer.decode([token_id]) 510 536 511 537 if new_text: 512 538 # Update accumulated response for stop token checking 513 539 accumulated_response += new_text 514 - 540 + 515 541 # Filter out stop tokens with priority: native first, then chat fallback 516 542 # Check native stop tokens FIRST in accumulated response (highest priority) 517 543 native_stop_tokens = self._stop_tokens if self._stop_tokens else [] ··· 522 548 # Calculate what text came before the stop token 523 549 text_before_stop = accumulated_response[:stop_pos] 524 550 # Calculate how much of that is new (not previously yielded) 525 - previously_yielded_length = len(accumulated_response) - len(new_text) 551 + previously_yielded_length = len(accumulated_response) - len( 552 + new_text 553 + ) 526 554 if len(text_before_stop) > previously_yielded_length: 527 555 # Yield only the new part before stop token 528 - new_part_before_stop = text_before_stop[previously_yielded_length:] 556 + new_part_before_stop = text_before_stop[ 557 + previously_yielded_length: 558 + ] 529 559 if new_part_before_stop: 530 560 if reasoning_parser: 531 561 # Process through reasoning parser for formatting 532 - for formatted_token in reasoning_parser.process_token(new_part_before_stop): 562 + for ( 563 + formatted_token 564 + ) in reasoning_parser.process_token( 565 + new_part_before_stop 566 + ): 533 567 yield formatted_token 534 568 else: 535 569 yield new_part_before_stop 536 570 return # Stop generation without yielding stop token 537 - 571 + 538 572 # Only check chat stop tokens if no native stop token found (fallback) 539 573 if use_chat_stop_tokens and self._chat_stop_tokens: 540 574 for stop_token in self._chat_stop_tokens: ··· 544 578 # Calculate what text came before the stop 
token 545 579 text_before_stop = accumulated_response[:stop_pos] 546 580 # Calculate how much of that is new (not previously yielded) 547 - previously_yielded_length = len(accumulated_response) - len(new_text) 581 + previously_yielded_length = len(accumulated_response) - len( 582 + new_text 583 + ) 548 584 if len(text_before_stop) > previously_yielded_length: 549 585 # Yield only the new part before stop token 550 - new_part_before_stop = text_before_stop[previously_yielded_length:] 586 + new_part_before_stop = text_before_stop[ 587 + previously_yielded_length: 588 + ] 551 589 if new_part_before_stop: 552 590 if reasoning_parser: 553 591 # Process through reasoning parser for formatting 554 - for formatted_token in reasoning_parser.process_token(new_part_before_stop): 592 + for ( 593 + formatted_token 594 + ) in reasoning_parser.process_token( 595 + new_part_before_stop 596 + ): 555 597 yield formatted_token 556 598 else: 557 599 yield new_part_before_stop ··· 574 616 # Finalize reasoning parser if used 575 617 if reasoning_parser: 576 618 yield from reasoning_parser.finalize() 577 - 619 + 578 620 # Print generation statistics if verbose 579 621 if self.verbose: 580 622 generation_time = time.time() - start_time 581 - tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0 582 - print(f"\n\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)") 623 + tokens_per_second = ( 624 + tokens_generated / generation_time if generation_time > 0 else 0 625 + ) 626 + print( 627 + f"\n\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)" 628 + ) 583 629 584 630 def generate_batch( 585 631 self, ··· 593 639 interactive: bool = False, 594 640 ) -> str: 595 641 """Generate text in batch mode (non-streaming). 
596 - 642 + 597 643 Args: 598 644 prompt: Input prompt 599 645 max_tokens: Maximum tokens to generate ··· 603 649 repetition_context_size: Context size for repetition penalty 604 650 use_chat_template: Apply tokenizer's chat template if available 605 651 interactive: True if this is interactive mode (affects token limits) 606 - 652 + 607 653 Returns: 608 654 Generated text 609 655 """ ··· 614 660 effective_max_tokens = self.get_effective_max_tokens(max_tokens, interactive) 615 661 616 662 # Apply chat template if available and requested 617 - if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template: 663 + if ( 664 + use_chat_template 665 + and hasattr(self.tokenizer, "chat_template") 666 + and self.tokenizer.chat_template 667 + ): 618 668 messages = [{"role": "user", "content": prompt}] 619 669 formatted_prompt = self.tokenizer.apply_chat_template( 620 - messages, 621 - tokenize=False, 622 - add_generation_prompt=True 670 + messages, tokenize=False, add_generation_prompt=True 623 671 ) 624 672 else: 625 673 formatted_prompt = prompt ··· 654 702 655 703 for token, _ in generator: 656 704 # Token might be an array or an int 657 - token_id = token.item() if hasattr(token, 'item') else token 705 + token_id = token.item() if hasattr(token, "item") else token 658 706 generated_tokens.append(token_id) 659 707 all_tokens.append(token_id) 660 708 ··· 667 715 668 716 # Remove the prompt part 669 717 if full_response.startswith(formatted_prompt): 670 - response = full_response[len(formatted_prompt):] 718 + response = full_response[len(formatted_prompt) :] 671 719 else: 672 720 # Fallback: just decode generated tokens 673 721 response = self.tokenizer.decode(generated_tokens) 674 722 675 723 # Apply end-token filtering (same logic as streaming mode for Issue #20) 676 - response = self._filter_end_tokens_from_response(response, use_chat_stop_tokens=False) 677 - 724 + response = self._filter_end_tokens_from_response( 725 + response, 
use_chat_stop_tokens=False 726 + ) 727 + 678 728 # Format reasoning models output 679 729 response = self._format_reasoning_response(response) 680 730 ··· 683 733 # Count tokens for statistics 684 734 if self.verbose: 685 735 tokens_generated = len(generated_tokens) 686 - tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0 687 - print(f"\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)") 736 + tokens_per_second = ( 737 + tokens_generated / generation_time if generation_time > 0 else 0 738 + ) 739 + print( 740 + f"\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)" 741 + ) 688 742 689 743 return response 690 744 ··· 698 752 use_chat_template: bool = True, 699 753 ): 700 754 """Run an interactive chat session. 701 - 755 + 702 756 Args: 703 757 system_prompt: Optional system prompt to prepend 704 758 max_tokens: Maximum tokens per response ··· 718 772 # Get user input 719 773 user_input = input("You: ").strip() 720 774 721 - if user_input.lower() in ['exit', 'quit', 'q']: 775 + if user_input.lower() in ["exit", "quit", "q"]: 722 776 print("\nGoodbye!") 723 777 break 724 778 ··· 729 783 conversation_history.append({"role": "user", "content": user_input}) 730 784 731 785 # Format conversation for the model using chat template if available 732 - prompt = self._format_conversation(conversation_history, use_chat_template=use_chat_template) 786 + prompt = self._format_conversation( 787 + conversation_history, use_chat_template=use_chat_template 788 + ) 733 789 734 790 # Generate response with streaming 735 791 print("\nAssistant: ", end="", flush=True) ··· 751 807 752 808 # Add assistant response to history 753 809 assistant_response = "".join(response_tokens).strip() 754 - conversation_history.append({"role": "assistant", "content": assistant_response}) 810 + conversation_history.append( 811 + {"role": "assistant", "content": assistant_response} 
812 + ) 755 813 756 814 print() # New line after response 757 815 ··· 762 820 print(f"\n[ERROR] {e}") 763 821 continue 764 822 765 - def _format_conversation(self, messages: list, use_chat_template: bool = True) -> str: 823 + def _format_conversation( 824 + self, messages: list, use_chat_template: bool = True 825 + ) -> str: 766 826 """Format conversation history into a prompt. 767 - 827 + 768 828 Uses the tokenizer's chat template if available, otherwise falls back 769 829 to the legacy Human:/Assistant: format for compatibility. 770 - 830 + 771 831 Args: 772 832 messages: List of message dictionaries with 'role' and 'content' 773 833 use_chat_template: Whether to use chat template if available 774 - 834 + 775 835 Returns: 776 836 Formatted conversation string 777 837 """ 778 838 # Try to use native chat template if available 779 - if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template: 839 + if ( 840 + use_chat_template 841 + and hasattr(self.tokenizer, "chat_template") 842 + and self.tokenizer.chat_template 843 + ): 780 844 try: 781 845 # Apply the tokenizer's chat template 782 846 formatted_prompt = self.tokenizer.apply_chat_template( 783 - messages, 784 - tokenize=False, 785 - add_generation_prompt=True 847 + messages, tokenize=False, add_generation_prompt=True 786 848 ) 787 849 return formatted_prompt 788 850 except Exception as e: 789 851 # If chat template fails, fall back to legacy format 790 852 if self.verbose: 791 853 print(f"[WARNING] Chat template failed, using legacy format: {e}") 792 - 854 + 793 855 # Legacy format fallback for compatibility 794 856 return self._legacy_format_conversation(messages) 795 - 857 + 796 858 def _legacy_format_conversation(self, messages: list) -> str: 797 859 """Legacy conversation formatting for backward compatibility. 798 - 860 + 799 861 This format was used in earlier versions and remains as a fallback 800 862 for models without chat templates. 
801 863 """ ··· 819 881 820 882 def get_memory_usage(self) -> Dict[str, float]: 821 883 """Get current memory usage statistics. 822 - 884 + 823 885 Returns: 824 886 Dictionary with memory statistics in GB 825 887 """ ··· 834 896 return { 835 897 "current_gb": current_memory, 836 898 "peak_gb": peak_memory, 837 - "model_gb": current_memory - self._memory_baseline if self._memory_baseline else 0, 899 + "model_gb": ( 900 + current_memory - self._memory_baseline if self._memory_baseline else 0 901 + ), 838 902 } 839 903 840 904 def _format_reasoning_response(self, response: str) -> str: 841 905 """Format response from reasoning models for better readability. 842 - 906 + 843 907 For MXFP4 models that generate reasoning followed by final answer, 844 908 format it nicely for display. 845 909 """ 846 910 if not self._is_reasoning_model: 847 911 return response 848 - 912 + 849 913 # Check if response contains reasoning markers 850 914 if self._reasoning_start in response and self._final_start in response: 851 915 # Extract reasoning and final parts 852 916 try: 853 917 # Split on the reasoning start 854 918 before_reasoning, after_start = response.split(self._reasoning_start, 1) 855 - 919 + 856 920 # Find the reasoning content (until <|end|>) 857 921 if self._reasoning_end in after_start: 858 - reasoning_content, after_reasoning = after_start.split(self._reasoning_end, 1) 859 - 922 + reasoning_content, after_reasoning = after_start.split( 923 + self._reasoning_end, 1 924 + ) 925 + 860 926 # Find the final answer 861 927 if self._final_start in after_reasoning: 862 928 # Extract everything after final marker 863 929 final_parts = after_reasoning.split(self._final_start, 1) 864 930 if len(final_parts) > 1: 865 931 # Remove the <|channel|>final<|message|> marker 866 - final_answer = final_parts[1].replace('<|channel|>final<|message|>', '', 1) 867 - 932 + final_answer = final_parts[1].replace( 933 + "<|channel|>final<|message|>", "", 1 934 + ) 935 + 868 936 # Format with clear 
markers for parsing but minimal visual impact 869 937 formatted = [] 870 938 formatted.append("\n**[Reasoning]**\n") 871 939 formatted.append(reasoning_content.strip()) 872 940 formatted.append("\n\n---\n\n**[Answer]**\n") 873 941 formatted.append(final_answer.strip()) 874 - 875 - return '\n'.join(formatted) 942 + 943 + return "\n".join(formatted) 876 944 except Exception: 877 945 # If parsing fails, return original 878 946 pass 879 - 947 + 880 948 # Fallback: just clean up the control tokens 881 949 cleaned = response 882 - for marker in ['<|channel|>analysis<|message|>', '<|end|>', '<|start|>assistant', 883 - '<|channel|>final<|message|>', '<|return|>']: 884 - cleaned = cleaned.replace(marker, '') 885 - 950 + for marker in [ 951 + "<|channel|>analysis<|message|>", 952 + "<|end|>", 953 + "<|start|>assistant", 954 + "<|channel|>final<|message|>", 955 + "<|return|>", 956 + ]: 957 + cleaned = cleaned.replace(marker, "") 958 + 886 959 return cleaned.strip() 887 - 888 - def _filter_end_tokens_from_response(self, response: str, use_chat_stop_tokens: bool = False) -> str: 960 + 961 + def _filter_end_tokens_from_response( 962 + self, response: str, use_chat_stop_tokens: bool = False 963 + ) -> str: 889 964 """Filter end tokens from a complete response (batch mode). 890 - 965 + 891 966 This method applies the same filtering logic as the streaming mode 892 967 to ensure consistent behavior between streaming and non-streaming. 
893 - 968 + 894 969 Args: 895 970 response: The complete generated response 896 971 use_chat_stop_tokens: Whether to apply chat stop tokens 897 - 972 + 898 973 Returns: 899 974 Response with end tokens filtered out 900 975 """ ··· 906 981 stop_pos = response.find(stop_token) 907 982 filtered_response = response[:stop_pos].rstrip() 908 983 if self.verbose: 909 - print(f"[DEBUG] Filtered stop token '{stop_token}' at position {stop_pos}") 984 + print( 985 + f"[DEBUG] Filtered stop token '{stop_token}' at position {stop_pos}" 986 + ) 910 987 return filtered_response 911 - 988 + 912 989 # Only check chat stop tokens if no native stop token found (fallback) 913 990 if use_chat_stop_tokens and self._chat_stop_tokens: 914 991 for stop_token in self._chat_stop_tokens: ··· 916 993 # Find the stop token position and return everything before it 917 994 stop_pos = response.find(stop_token) 918 995 return response[:stop_pos] 919 - 996 + 920 997 # No stop tokens found, return original response 921 998 return response 922 999 923 1000 924 1001 def get_gpu_status() -> Dict[str, float]: 925 1002 """Independent GPU status check - usable from anywhere. 926 - 1003 + 927 1004 Returns: 928 1005 Dictionary with GPU memory statistics in GB 929 1006 """ ··· 935 1012 936 1013 def check_memory_available(required_gb: float) -> bool: 937 1014 """Pre-flight check before model loading. 938 - 1015 + 939 1016 Args: 940 1017 required_gb: Required memory in GB 941 - 1018 + 942 1019 Returns: 943 1020 True if memory is likely available (conservative estimate) 944 1021 """ ··· 966 1043 verbose: bool = False, 967 1044 ) -> Optional[str]: 968 1045 """Enhanced run function with direct MLX integration. 969 - 1046 + 970 1047 Uses context manager pattern for automatic resource cleanup. 
971 - 1048 + 972 1049 Args: 973 1050 model_path: Path to the MLX model 974 1051 prompt: Input prompt (if None, enters interactive mode) ··· 978 1055 top_p: Top-p sampling parameter 979 1056 repetition_penalty: Penalty for repeated tokens 980 1057 stream: Whether to stream output 981 - 1058 + 982 1059 Returns: 983 1060 Generated text (in non-interactive mode) 984 1061 """ ··· 1038 1115 # Show memory usage if verbose 1039 1116 if verbose: 1040 1117 memory_stats = runner.get_memory_usage() 1041 - print(f"\n\nMemory: {memory_stats['model_gb']:.1f}GB model, {memory_stats['current_gb']:.1f}GB total") 1118 + print( 1119 + f"\n\nMemory: {memory_stats['model_gb']:.1f}GB model, {memory_stats['current_gb']:.1f}GB total" 1120 + ) 1042 1121 1043 1122 return response 1044 1123 ··· 1047 1126 except Exception as e: 1048 1127 print(f"\n[ERROR] {e}") 1049 1128 return None 1050 -
+27 -43
server/model_card.py
··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 
22 1 from __future__ import annotations 23 2 24 3 # ruff: noqa: UP045 ··· 45 24 def _latest_snapshot_dir(model_base_dir: Path) -> Optional[Path]: 46 25 """Return latest snapshot directory for a cached HF model base dir.""" 47 26 try: 48 - snaps = (model_base_dir / "snapshots") 27 + snaps = model_base_dir / "snapshots" 49 28 if not snaps.exists(): 50 29 return None 51 30 candidates = [d for d in snaps.iterdir() if d.is_dir()] ··· 73 52 """ 74 53 start = text.find("\n---\n") 75 54 # Accept files starting directly with '---' too 76 - if text.startswith('---'): 55 + if text.startswith("---"): 77 56 start = 0 78 57 elif start >= 0: 79 58 start = start + 1 # move to line start 80 59 else: 81 60 # Try at very beginning without newline 82 - start = 0 if text[:3] == '---' else -1 61 + start = 0 if text[:3] == "---" else -1 83 62 if start != 0: 84 63 return {} 85 64 86 65 # Find closing '---' after start 87 - end = text.find('\n---', 3) 66 + end = text.find("\n---", 3) 88 67 if end == -1: 89 68 return {} 90 - header = text[3:end] if text.startswith('---') else text[start + 3:end] 69 + header = text[3:end] if text.startswith("---") else text[start + 3 : end] 91 70 92 71 # Normalize lines 93 72 lines = [ln.strip() for ln in header.splitlines() if ln.strip()] ··· 103 82 list_acc = [] 104 83 105 84 for ln in lines: 106 - if ln.startswith('- '): 85 + if ln.startswith("- "): 107 86 # list item under current_key 108 - val = ln[2:].strip().strip('"\'') 87 + val = ln[2:].strip().strip("\"'") 109 88 if current_key is not None: 110 89 list_acc.append(val) 111 90 continue 112 91 # key: value or key: [a, b] 113 - if ':' in ln: 92 + if ":" in ln: 114 93 # Close any previous list 115 94 flush_list() 116 - key, val = ln.split(':', 1) 95 + key, val = ln.split(":", 1) 117 96 key = key.strip() 118 97 val = val.strip() 119 98 current_key = key ··· 122 101 data.setdefault(key, []) 123 102 continue 124 103 # Inline list [a, b] 125 - if val.startswith('[') and val.endswith(']'): 104 + if 
val.startswith("[") and val.endswith("]"): 126 105 inner = val[1:-1].strip() 127 - items = [] if not inner else [it.strip().strip('"\'') for it in inner.split(',')] 106 + items = ( 107 + [] 108 + if not inner 109 + else [it.strip().strip("\"'") for it in inner.split(",")] 110 + ) 128 111 data[key] = [x for x in items if x] 129 112 continue 130 113 # Scalar 131 - data[key] = val.strip('"\'') 114 + data[key] = val.strip("\"'") 132 115 continue 133 116 # Non key-value, ignore 134 117 # Flush last list ··· 136 119 return data 137 120 138 121 139 - def read_readme_front_matter(model_base_dir: Path) -> Tuple[Optional[List[str]], Optional[str], Optional[str]]: 122 + def read_readme_front_matter( 123 + model_base_dir: Path, 124 + ) -> Tuple[Optional[List[str]], Optional[str], Optional[str]]: 140 125 """Read README.md front matter and extract tags, pipeline_tag, library_name. 141 126 142 127 Returns (tags, pipeline_tag, library_name) with lowercase normalization where applicable. ··· 146 131 snap = _latest_snapshot_dir(model_base_dir) 147 132 if not snap: 148 133 return None, None, None 149 - readme = snap / 'README.md' 134 + readme = snap / "README.md" 150 135 if not readme.exists(): 151 136 return None, None, None 152 - text = readme.read_text(encoding='utf-8', errors='ignore') 137 + text = readme.read_text(encoding="utf-8", errors="ignore") 153 138 fm = _lenient_yaml_front_matter(text) 154 139 if not fm: 155 140 return None, None, None 156 - tags = fm.get('tags') 141 + tags = fm.get("tags") 157 142 if isinstance(tags, list): 158 143 tags = [str(t).strip().lower() for t in tags if str(t).strip()] 159 144 else: 160 145 tags = None 161 - pipeline = fm.get('pipeline_tag') 146 + pipeline = fm.get("pipeline_tag") 162 147 pipeline = str(pipeline).strip().lower() if pipeline else None 163 - lib = fm.get('library_name') 148 + lib = fm.get("library_name") 164 149 lib = str(lib).strip().lower() if lib else None 165 150 return tags, pipeline, lib 166 151 except Exception: ··· 173 
158 snap = _latest_snapshot_dir(model_base_dir) 174 159 if not snap: 175 160 return False 176 - tk = snap / 'tokenizer_config.json' 161 + tk = snap / "tokenizer_config.json" 177 162 if not tk.exists(): 178 163 return False 179 - with open(tk, encoding='utf-8') as f: 164 + with open(tk, encoding="utf-8") as f: 180 165 data = json.load(f) 181 - tmpl = data.get('chat_template') 166 + tmpl = data.get("chat_template") 182 167 return bool(tmpl and isinstance(tmpl, str) and tmpl.strip()) 183 168 except Exception: 184 169 return False 185 -
+3
server/pyproject.toml
··· 14 14 [build-system] 15 15 requires = ["setuptools", "wheel"] 16 16 build-backend = "setuptools.build_meta" 17 + 18 + [tool.setuptools.packages.find] 19 + exclude = ["backend", "backend.*"]
+5
server/pyrightconfig.json
··· 1 + { 2 + "venvPath": ".", 3 + "venv": ".venv" 4 + } 5 +
+172 -165
server/reasoning_utils.py
··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 22 1 """ 23 2 Utilities for handling reasoning models and their output. 
24 3 ··· 35 14 36 15 class ReasoningExtractor: 37 16 """Extract reasoning and final answer from model outputs.""" 38 - 17 + 39 18 # Model-specific patterns 40 19 PATTERNS = { 41 - 'gpt-oss': { 42 - 'reasoning': r'<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>', 43 - 'final': r'<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)', 44 - 'markers': { 45 - 'reasoning_start': '<|channel|>analysis<|message|>', 46 - 'reasoning_end': '<|end|>', 47 - 'final_marker': '<|channel|>final<|message|>', 20 + "gpt-oss": { 21 + "reasoning": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>", 22 + "final": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)", 23 + "markers": { 24 + "reasoning_start": "<|channel|>analysis<|message|>", 25 + "reasoning_end": "<|end|>", 26 + "final_marker": "<|channel|>final<|message|>", 48 27 # Skip tokens that appear between reasoning and final 49 - 'skip_tokens': ['<|start|>assistant<|channel|>final<|message|>', '<|start|>assistant', '<|start|>', '<|channel|>final<|message|>'], 28 + "skip_tokens": [ 29 + "<|start|>assistant<|channel|>final<|message|>", 30 + "<|start|>assistant", 31 + "<|start|>", 32 + "<|channel|>final<|message|>", 33 + ], 50 34 # Conditional skip tokens - only skip if at start of final section 51 - 'conditional_skip': ['assistant'] 52 - } 35 + "conditional_skip": ["assistant"], 36 + }, 53 37 }, 54 - 'deepseek': { 55 - 'reasoning': r'<think>(.*?)</think>', 56 - 'final': r'</think>(.*?)$', 57 - 'markers': { 58 - 'reasoning_start': '<think>', 59 - 'reasoning_end': '</think>', 60 - } 38 + "deepseek": { 39 + "reasoning": r"<think>(.*?)</think>", 40 + "final": r"</think>(.*?)$", 41 + "markers": { 42 + "reasoning_start": "<think>", 43 + "reasoning_end": "</think>", 44 + }, 61 45 }, 62 - 'claude': { 63 - 'reasoning': r'<thinking>(.*?)</thinking>', 64 - 'final': r'</thinking>(.*?)$', 65 - 'markers': { 66 - 'reasoning_start': '<thinking>', 67 - 'reasoning_end': '</thinking>', 68 - } 69 - } 46 + "claude": { 47 + "reasoning": 
r"<thinking>(.*?)</thinking>", 48 + "final": r"</thinking>(.*?)$", 49 + "markers": { 50 + "reasoning_start": "<thinking>", 51 + "reasoning_end": "</thinking>", 52 + }, 53 + }, 70 54 } 71 - 55 + 72 56 @classmethod 73 57 def detect_model_type(cls, model_name: str) -> Optional[str]: 74 58 """Detect reasoning model type from model name.""" 75 59 model_lower = model_name.lower() 76 - 77 - if 'gpt-oss' in model_lower: 78 - return 'gpt-oss' 79 - elif 'deepseek' in model_lower and 'r1' in model_lower: 80 - return 'deepseek' 81 - elif 'claude' in model_lower: 82 - return 'claude' 83 - elif 'qwq' in model_lower: 84 - return 'gpt-oss' # QwQ uses similar format to GPT-OSS 85 - 60 + 61 + if "gpt-oss" in model_lower: 62 + return "gpt-oss" 63 + elif "deepseek" in model_lower and "r1" in model_lower: 64 + return "deepseek" 65 + elif "claude" in model_lower: 66 + return "claude" 67 + elif "qwq" in model_lower: 68 + return "gpt-oss" # QwQ uses similar format to GPT-OSS 69 + 86 70 return None 87 - 71 + 88 72 @classmethod 89 - def extract(cls, text: str, model_type: Optional[str] = None, 90 - model_name: Optional[str] = None) -> Dict[str, Optional[str]]: 73 + def extract( 74 + cls, 75 + text: str, 76 + model_type: Optional[str] = None, 77 + model_name: Optional[str] = None, 78 + ) -> Dict[str, Optional[str]]: 91 79 """ 92 80 Extract reasoning and final answer from model output. 93 - 81 + 94 82 Args: 95 83 text: The full model output 96 84 model_type: Explicit model type ('mxfp4', 'deepseek', etc.) 
97 85 model_name: Model name to auto-detect type 98 - 86 + 99 87 Returns: 100 88 Dictionary with 'reasoning', 'final_answer', and 'full_response' 101 89 """ 102 90 # Auto-detect model type if not provided 103 91 if not model_type and model_name: 104 92 model_type = cls.detect_model_type(model_name) 105 - 93 + 106 94 # If no model type detected, return text as-is 107 95 if not model_type or model_type not in cls.PATTERNS: 108 96 return { 109 - 'reasoning': None, 110 - 'final_answer': text, 111 - 'full_response': text, 112 - 'has_reasoning': False 97 + "reasoning": None, 98 + "final_answer": text, 99 + "full_response": text, 100 + "has_reasoning": False, 113 101 } 114 - 102 + 115 103 patterns = cls.PATTERNS[model_type] 116 - 104 + 117 105 # Extract reasoning 118 - reasoning_match = re.search(patterns['reasoning'], text, re.DOTALL) 106 + reasoning_match = re.search(patterns["reasoning"], text, re.DOTALL) 119 107 reasoning = reasoning_match.group(1).strip() if reasoning_match else None 120 - 108 + 121 109 # Extract final answer 122 - final_match = re.search(patterns['final'], text, re.DOTALL) 110 + final_match = re.search(patterns["final"], text, re.DOTALL) 123 111 final_answer = final_match.group(1).strip() if final_match else None 124 - 112 + 125 113 # If no final answer found but we have reasoning, 126 114 # the text after reasoning might be the answer 127 115 if reasoning and not final_answer: 128 116 # Try to find text after reasoning markers 129 - markers = patterns.get('markers', {}) 130 - if 'reasoning_end' in markers: 131 - split_text = text.split(markers['reasoning_end'], 1) 117 + markers = patterns.get("markers", {}) 118 + if "reasoning_end" in markers: 119 + split_text = text.split(markers["reasoning_end"], 1) 132 120 if len(split_text) > 1: 133 121 # Clean up any remaining markers 134 122 remaining = split_text[1] 135 123 for marker in markers.values(): 136 - remaining = remaining.replace(marker, '') 124 + remaining = remaining.replace(marker, "") 137 125 
final_answer = remaining.strip() 138 - 126 + 139 127 # If still no final answer, use full text minus reasoning markers 140 128 if not final_answer: 141 129 final_answer = text 142 130 # Remove all known markers 143 131 if model_type in cls.PATTERNS: 144 - markers = cls.PATTERNS[model_type].get('markers', {}) 132 + markers = cls.PATTERNS[model_type].get("markers", {}) 145 133 for marker in markers.values(): 146 - final_answer = final_answer.replace(marker, '') 134 + final_answer = final_answer.replace(marker, "") 147 135 final_answer = final_answer.strip() 148 - 136 + 149 137 return { 150 - 'reasoning': reasoning, 151 - 'final_answer': final_answer, 152 - 'full_response': text, 153 - 'has_reasoning': bool(reasoning), 154 - 'model_type': model_type 138 + "reasoning": reasoning, 139 + "final_answer": final_answer, 140 + "full_response": text, 141 + "has_reasoning": bool(reasoning), 142 + "model_type": model_type, 155 143 } 156 - 144 + 157 145 @classmethod 158 - def format_for_display(cls, extracted: Dict[str, Optional[str]], 159 - show_reasoning: bool = False) -> str: 146 + def format_for_display( 147 + cls, extracted: Dict[str, Optional[str]], show_reasoning: bool = False 148 + ) -> str: 160 149 """ 161 150 Format extracted content for display. 
162 - 151 + 163 152 Args: 164 153 extracted: Output from extract() 165 154 show_reasoning: Whether to include reasoning in output 166 - 155 + 167 156 Returns: 168 157 Formatted string for display 169 158 """ 170 - if not extracted.get('has_reasoning'): 171 - return extracted.get('final_answer', '') 172 - 159 + if not extracted.get("has_reasoning"): 160 + return extracted.get("final_answer", "") 161 + 173 162 if show_reasoning: 174 163 output = [] 175 - if extracted.get('reasoning'): 164 + if extracted.get("reasoning"): 176 165 output.append("═══ Reasoning ═══") 177 - output.append(extracted['reasoning']) 166 + output.append(extracted["reasoning"]) 178 167 output.append("\n═══ Answer ═══") 179 - output.append(extracted.get('final_answer', '')) 180 - return '\n'.join(output) 168 + output.append(extracted.get("final_answer", "")) 169 + return "\n".join(output) 181 170 else: 182 - return extracted.get('final_answer', '') 171 + return extracted.get("final_answer", "") 183 172 184 173 185 174 class StreamingReasoningHandler: 186 175 """Handle reasoning during streaming generation.""" 187 - 176 + 188 177 def __init__(self, model_type: Optional[str] = None): 189 178 self.model_type = model_type 190 179 self.buffer = "" ··· 193 182 self.in_reasoning = False 194 183 self.in_final = False 195 184 self.markers = {} 196 - 185 + 197 186 if model_type and model_type in ReasoningExtractor.PATTERNS: 198 - self.markers = ReasoningExtractor.PATTERNS[model_type].get('markers', {}) 199 - 187 + self.markers = ReasoningExtractor.PATTERNS[model_type].get("markers", {}) 188 + 200 189 def process_token(self, token: str) -> Tuple[str, bool]: 201 190 """ 202 191 Process a streaming token. 
203 - 192 + 204 193 Args: 205 194 token: The new token 206 - 195 + 207 196 Returns: 208 197 (output_token, should_display) - token to output and whether to display it 209 198 """ 210 199 self.buffer += token 211 - 200 + 212 201 # Check for reasoning start 213 - if not self.in_reasoning and self.markers.get('reasoning_start'): 214 - if self.markers['reasoning_start'] in self.buffer: 202 + if not self.in_reasoning and self.markers.get("reasoning_start"): 203 + if self.markers["reasoning_start"] in self.buffer: 215 204 self.in_reasoning = True 216 - self.reasoning_buffer = self.buffer.split(self.markers['reasoning_start'])[1] 205 + self.reasoning_buffer = self.buffer.split( 206 + self.markers["reasoning_start"] 207 + )[1] 217 208 return ("", False) # Don't display reasoning start marker 218 - 209 + 219 210 # If in reasoning, buffer it 220 211 if self.in_reasoning: 221 212 self.reasoning_buffer += token 222 - 213 + 223 214 # Check for reasoning end 224 - if self.markers.get('reasoning_end') and self.markers['reasoning_end'] in self.reasoning_buffer: 215 + if ( 216 + self.markers.get("reasoning_end") 217 + and self.markers["reasoning_end"] in self.reasoning_buffer 218 + ): 225 219 self.in_reasoning = False 226 220 self.in_final = True 227 221 # Clean up reasoning buffer 228 - self.reasoning_buffer = self.reasoning_buffer.replace(self.markers['reasoning_end'], '') 222 + self.reasoning_buffer = self.reasoning_buffer.replace( 223 + self.markers["reasoning_end"], "" 224 + ) 229 225 return ("", False) # Don't display reasoning end marker 230 - 226 + 231 227 return ("", False) # Don't display reasoning content by default 232 - 228 + 233 229 # If in final answer section 234 230 if self.in_final: 235 231 # Skip final answer markers 236 - if self.markers.get('final_marker') and self.markers['final_marker'] in token: 232 + if ( 233 + self.markers.get("final_marker") 234 + and self.markers["final_marker"] in token 235 + ): 237 236 return ("", False) 238 - 237 + 239 238 
self.final_buffer += token 240 239 return (token, True) # Display final answer 241 - 240 + 242 241 # Default: display token if not in special section 243 242 return (token, True) 244 243 245 244 246 245 class StreamingReasoningParser: 247 246 """Parser for real-time streaming with reasoning model formatting.""" 248 - 247 + 249 248 def __init__(self, model_type: Optional[str] = None, hide_reasoning: bool = False): 250 249 self.model_type = model_type 251 250 self.hide_reasoning = hide_reasoning ··· 253 252 self.buffer = "" 254 253 self.reasoning_content = "" 255 254 self.patterns = {} 256 - 255 + 257 256 if model_type and model_type in ReasoningExtractor.PATTERNS: 258 - self.patterns = ReasoningExtractor.PATTERNS[model_type].get('markers', {}) 259 - 257 + self.patterns = ReasoningExtractor.PATTERNS[model_type].get("markers", {}) 258 + 260 259 def process_token(self, token: str): 261 260 """ 262 261 Process a streaming token and yield formatted output. 263 - 262 + 264 263 Args: 265 264 token: New token from model 266 - 265 + 267 266 Yields: 268 267 Formatted output tokens for display 269 268 """ 270 269 self.buffer += token 271 - 270 + 272 271 # State: WAITING - looking for reasoning start 273 272 if self.state == "WAITING": 274 - reasoning_start = self.patterns.get('reasoning_start') 273 + reasoning_start = self.patterns.get("reasoning_start") 275 274 if reasoning_start and reasoning_start in self.buffer: 276 275 # Found reasoning start 277 276 before_reasoning = self.buffer.split(reasoning_start, 1)[0] 278 - 277 + 279 278 # Yield any content before reasoning (but not control tokens) 280 - if before_reasoning.strip() and not before_reasoning.strip().startswith('<|'): 279 + if before_reasoning.strip() and not before_reasoning.strip().startswith( 280 + "<|" 281 + ): 281 282 yield before_reasoning 282 - 283 + 283 284 # Start reasoning section (only if not hiding reasoning) 284 285 if not self.hide_reasoning: 285 286 yield "**[Reasoning]**\n\n" 286 - 287 + 287 288 # 
Switch to reasoning state 288 289 self.buffer = self.buffer.split(reasoning_start, 1)[1] 289 290 self.state = "IN_REASONING" 290 - 291 + 291 292 # Process remaining buffer recursively 292 293 if self.buffer.strip(): 293 294 yield from self.process_token("") 294 295 return 295 - 296 + 296 297 # Check if buffer might contain start of reasoning pattern 297 298 if reasoning_start: 298 299 # Check if buffer ends with partial pattern ··· 301 302 if self.buffer.endswith(reasoning_start[:i]): 302 303 has_partial_match = True 303 304 break 304 - 305 + 305 306 if has_partial_match: 306 307 # Don't yield yet - might be building up to pattern 307 308 return 308 - 309 + 309 310 # No partial match, safe to yield older content 310 311 # Keep enough buffer to detect pattern 311 312 pattern_len = len(reasoning_start) ··· 315 316 if to_yield: 316 317 yield to_yield 317 318 return 318 - 319 + 319 320 # No reasoning pattern expected or very short buffer 320 321 if not reasoning_start: 321 322 yield token 322 - 323 + 323 324 # State: IN_REASONING - collecting reasoning content 324 325 elif self.state == "IN_REASONING": 325 - reasoning_end = self.patterns.get('reasoning_end') 326 + reasoning_end = self.patterns.get("reasoning_end") 326 327 if reasoning_end and reasoning_end in self.buffer: 327 328 # Found reasoning end 328 329 reasoning_part = self.buffer.split(reasoning_end, 1)[0] 329 - 330 + 330 331 # Yield reasoning content (only if not hiding reasoning) 331 332 if reasoning_part and not self.hide_reasoning: 332 333 yield reasoning_part 333 - 334 + 334 335 # Add separator (only if not hiding reasoning) 335 336 if not self.hide_reasoning: 336 337 yield "\n\n---\n\n**[Answer]**\n\n" 337 - 338 + 338 339 # Switch to final state 339 340 self.buffer = self.buffer.split(reasoning_end, 1)[1] 340 341 self.state = "IN_FINAL" 341 - self._final_content_started = False # Track if we've started outputting final content 342 - 342 + self._final_content_started = ( 343 + False # Track if we've 
started outputting final content 344 + ) 345 + 343 346 # Skip intermediate control tokens 344 - skip_tokens = self.patterns.get('skip_tokens', []) 347 + skip_tokens = self.patterns.get("skip_tokens", []) 345 348 for skip_token in skip_tokens: 346 - self.buffer = self.buffer.replace(skip_token, '') 347 - 349 + self.buffer = self.buffer.replace(skip_token, "") 350 + 348 351 # Skip final marker when we find it 349 - final_marker = self.patterns.get('final_marker') 352 + final_marker = self.patterns.get("final_marker") 350 353 if final_marker and final_marker in self.buffer: 351 354 self.buffer = self.buffer.split(final_marker, 1)[1] 352 - 355 + 353 356 # Process remaining buffer 354 357 if self.buffer.strip(): 355 358 yield from self.process_token("") 356 359 return 357 - 360 + 358 361 # Still in reasoning, yield the content (only if not hiding reasoning) 359 362 if not self.hide_reasoning: 360 363 yield token 361 - 364 + 362 365 # State: IN_FINAL - normal streaming of final answer 363 366 elif self.state == "IN_FINAL": 364 367 # Check for control tokens from patterns that should be filtered 365 - skip_tokens = self.patterns.get('skip_tokens', []) 366 - conditional_skip = self.patterns.get('conditional_skip', []) 367 - 368 + skip_tokens = self.patterns.get("skip_tokens", []) 369 + conditional_skip = self.patterns.get("conditional_skip", []) 370 + 368 371 # Check if buffer contains any skip tokens and filter them out 369 372 for skip_token in skip_tokens: 370 373 if skip_token in self.buffer: 371 374 # Remove the skip token and continue 372 - self.buffer = self.buffer.replace(skip_token, '') 375 + self.buffer = self.buffer.replace(skip_token, "") 373 376 # Process remaining buffer if any 374 377 if self.buffer.strip(): 375 378 yield from self.process_token("") 376 379 return 377 - 380 + 378 381 # Check for final marker and filter it too 379 - final_marker = self.patterns.get('final_marker') 382 + final_marker = self.patterns.get("final_marker") 380 383 if final_marker 
and final_marker in self.buffer: 381 384 # Split at final marker and yield only content after it 382 385 parts = self.buffer.split(final_marker, 1) ··· 388 391 else: 389 392 # Just the marker itself, skip it 390 393 return 391 - 394 + 392 395 # Check conditional skip tokens - only at start of final section 393 - if not getattr(self, '_final_content_started', False): 396 + if not getattr(self, "_final_content_started", False): 394 397 for cond_token in conditional_skip: 395 398 if token.strip() == cond_token: 396 399 # Skip this token at the beginning of final section 397 400 return 398 401 # Mark that final content has started after first non-conditional token 399 - if token.strip() and not any(token.strip() == ct for ct in conditional_skip): 402 + if token.strip() and not any( 403 + token.strip() == ct for ct in conditional_skip 404 + ): 400 405 self._final_content_started = True 401 - 406 + 402 407 # Check if we might be building up to a skip token - be conservative 403 408 potential_skip = False 404 409 for skip_token in skip_tokens: 405 - if skip_token.startswith(token) or any(skip_token.startswith(self.buffer[-i:]) for i in range(1, min(len(skip_token), len(self.buffer)) + 1)): 410 + if skip_token.startswith(token) or any( 411 + skip_token.startswith(self.buffer[-i:]) 412 + for i in range(1, min(len(skip_token), len(self.buffer)) + 1) 413 + ): 406 414 potential_skip = True 407 415 break 408 - 416 + 409 417 if potential_skip: 410 418 # Don't yield yet, might be building up to a skip token 411 419 return 412 - 420 + 413 421 # Normal token in final answer - safe to yield 414 422 yield token 415 - 423 + 416 424 def finalize(self): 417 425 """ 418 426 Finalize parsing and yield any remaining buffer content. ··· 428 436 elif self.state == "IN_FINAL": 429 437 # Final answer content 430 438 yield self.buffer 431 -
+1
server/runtime.py
# Module-level slot that starts out unset (None).
# NOTE(review): presumably assigned by the server at startup so other modules
# can share the active backend — confirm against the code that imports this.
backend = None
+65
server/schemas.py
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field


class CompletionRequest(BaseModel):
    """Request body carrying a model name, one or more prompts, and sampling options."""

    model: str
    prompt: Union[str, List[str]]
    max_tokens: Optional[int] = None
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    stream: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = None
    repetition_penalty: Optional[float] = 1.1


class ChatMessage(BaseModel):
    """A single chat message; `role` must be exactly system, user, or assistant."""

    role: str = Field(..., pattern="^(system|user|assistant)$")
    content: str


class ChatCompletionRequest(BaseModel):
    """Request body carrying a message list plus the required `chat_start` and `python_code` fields.

    The sampling options share the defaults used by `CompletionRequest`.
    """

    model: str
    messages: List[ChatMessage]
    chat_start: bool
    python_code: str
    max_tokens: Optional[int] = None
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    stream: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = None
    repetition_penalty: Optional[float] = 1.1


class CompletionResponse(BaseModel):
    """Response body whose `object` field defaults to "text_completion"."""

    id: str
    object: str = "text_completion"
    created: int
    model: str
    choices: List[Dict[str, Any]]
    usage: Dict[str, int]


class ChatCompletionResponse(BaseModel):
    """Response body whose `object` field defaults to "chat.completion".

    Unlike `CompletionResponse`, this model currently has no `usage` field.
    """

    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Dict[str, Any]]
    # usage: Dict[str, int]


class ModelInfo(BaseModel):
    """Description of one model entry; only `id` is required."""

    id: str
    object: str = "model"
    owned_by: str = "mlx-knife"
    permission: List[Any] = []
    context_length: Optional[int] = None


class StartRequest(BaseModel):
    """Request body carrying a model name and a memory path."""

    model: str
    memory_path: str


class DownloadRequest(BaseModel):
    """Request body carrying only a model name."""

    model: str


# Backward-compatible alias: the class was originally published under this
# lower-camel name, so existing imports must keep working.
downloadRequest = DownloadRequest
-41
src/commands/mod.rs
··· 1 - // Module that handles CLI commands 2 - 3 - use anyhow::Result; 4 - use tiles::{ 5 - core::{ 6 - health, 7 - modelfile::{self, Modelfile}, 8 - }, 9 - runner::mlx, 10 - }; 11 - 12 - const DEFAULT_MODELFILE: &str = " 13 - FROM driaforall/mem-agent-mlx-4bit 14 - "; 15 - 16 - pub async fn run(modelfile: Option<String>) { 17 - let modelfile_parse_result: Result<Modelfile, String> = if let Some(modelfile_str) = modelfile { 18 - modelfile::parse_from_file(modelfile_str.as_str()) 19 - } else { 20 - modelfile::parse(DEFAULT_MODELFILE) 21 - }; 22 - 23 - match modelfile_parse_result { 24 - Ok(modelfile) => { 25 - mlx::run(modelfile).await; 26 - } 27 - Err(err) => println!("{}", err), 28 - } 29 - } 30 - 31 - pub fn check_health() { 32 - health::check_health(); 33 - } 34 - 35 - pub async fn start_server() { 36 - let _ = mlx::start_server_daemon().await; 37 - } 38 - 39 - pub fn stop_server() { 40 - let _ = mlx::stop_server_daemon(); 41 - }
src/core/health.rs tiles/src/core/health.rs
-2
src/core/mod.rs
··· 1 - pub mod health; 2 - pub mod modelfile;
src/core/modelfile.rs tilekit/src/modelfile.rs
+1 -1
src/lib.rs tiles/src/lib.rs
··· 1 1 pub mod core; 2 - pub mod runner; 2 + pub mod runtime; 3 3 4 4 #[cfg(test)] 5 5 mod tests {}
+6 -4
src/main.rs tiles/src/main.rs
··· 1 1 use std::error::Error; 2 2 3 3 use clap::{Args, Parser, Subcommand}; 4 + use tiles::runtime::build_runtime; 4 5 mod commands; 5 6 #[derive(Debug, Parser)] 6 7 #[command(name = "tiles")] ··· 38 39 /// Stops the daemon py server 39 40 Stop, 40 41 } 41 - #[tokio::main] 42 + #[tokio::main(flavor = "current_thread")] 42 43 pub async fn main() -> Result<(), Box<dyn Error>> { 43 44 let cli = Cli::parse(); 45 + let runtime = build_runtime(); 44 46 match cli.command { 45 47 Commands::Run { modelfile_path } => { 46 - commands::run(modelfile_path).await; 48 + commands::run(&runtime, modelfile_path).await; 47 49 } 48 50 Commands::Health => { 49 51 commands::check_health(); 50 52 } 51 53 Commands::Server(server) => match server.command { 52 - Some(ServerCommands::Start) => commands::start_server().await, 53 - Some(ServerCommands::Stop) => commands::stop_server(), 54 + Some(ServerCommands::Start) => commands::start_server(&runtime).await, 55 + Some(ServerCommands::Stop) => commands::stop_server(&runtime).await, 54 56 _ => println!("Expected start or stop"), 55 57 }, 56 58 }
+85 -77
src/runner/mlx.rs tiles/src/runtime/mlx.rs
··· 1 - use crate::core::modelfile::Modelfile; 2 1 use anyhow::{Context, Result}; 3 2 use futures_util::StreamExt; 4 3 use owo_colors::OwoColorize; ··· 11 10 use std::time::Duration; 12 11 use std::{env, fs}; 13 12 use std::{io, process::Command}; 13 + use tilekit::modelfile::Modelfile; 14 14 use tokio::time::sleep; 15 + 16 + pub struct MLXRuntime {} 17 + 18 + impl MLXRuntime {} 15 19 pub struct ChatResponse { 16 20 // think: String, 17 21 reply: String, 18 22 code: String, 19 23 } 20 24 21 - pub async fn run(modelfile: Modelfile) { 22 - let model = modelfile.from.as_ref().unwrap(); 23 - if model.starts_with("driaforall/mem-agent") { 24 - let _res = run_model_with_server(modelfile).await; 25 - } else { 26 - run_model_by_sub_process(modelfile); 25 + impl Default for MLXRuntime { 26 + fn default() -> Self { 27 + Self::new() 28 + } 29 + } 30 + 31 + impl MLXRuntime { 32 + pub fn new() -> Self { 33 + MLXRuntime {} 34 + } 35 + 36 + pub async fn run(&self, run_args: super::RunArgs) { 37 + let model = run_args.modelfile.from.as_ref().unwrap(); 38 + if model.starts_with("driaforall/mem-agent") { 39 + let _res = run_model_with_server(self, run_args.modelfile).await; 40 + } else { 41 + run_model_by_sub_process(run_args.modelfile); 42 + } 43 + } 44 + 45 + #[allow(clippy::zombie_processes)] 46 + pub async fn start_server_daemon(&self) -> Result<()> { 47 + // check if the server is running 48 + // start server as a child process 49 + // save the pid in a file under ~/.config/tiles/server_pid 50 + 51 + if (ping().await).is_ok() { 52 + println!("server is already up"); 53 + return Ok(()); 54 + } 55 + 56 + let config_dir = get_config_dir()?; 57 + let mut server_dir = get_server_dir()?; 58 + let pid_file = config_dir.join("server.pid"); 59 + fs::create_dir_all(&config_dir).context("Failed to create config directory")?; 60 + 61 + let stdout_log = File::create(config_dir.join("server.out.log"))?; 62 + let stderr_log = File::create(config_dir.join("server.err.log"))?; 63 + let 
server_path = server_dir.join(".venv/bin/python3"); 64 + server_dir.pop(); 65 + let child = Command::new(server_path) 66 + .args(["-m", "server.main"]) 67 + .current_dir(server_dir) 68 + .stdin(Stdio::null()) 69 + .stdout(Stdio::from(stdout_log)) 70 + .stderr(Stdio::from(stderr_log)) 71 + .spawn() 72 + .expect("failed to start server"); 73 + 74 + fs::create_dir_all(&config_dir).context("Failed to create config directory")?; 75 + std::fs::write(pid_file, child.id().to_string()).unwrap(); 76 + println!("Server started with PID {}", child.id()); 77 + Ok(()) 78 + } 79 + 80 + pub async fn stop_server_daemon(&self) -> Result<()> { 81 + if (ping().await).is_err() { 82 + println!("Server is not running"); 83 + return Ok(()); 84 + } 85 + let pid_file = get_config_dir()?.join("server.pid"); 86 + 87 + if !pid_file.exists() { 88 + eprintln!("server pid doesnt exist"); 89 + return Ok(()); 90 + } 91 + 92 + let pid = std::fs::read_to_string(&pid_file).unwrap(); 93 + Command::new("kill").arg(pid.trim()).status().unwrap(); 94 + std::fs::remove_file(pid_file).unwrap(); 95 + println!("Server stopped."); 96 + Ok(()) 27 97 } 28 98 } 29 99 ··· 82 152 } 83 153 } 84 154 85 - #[allow(clippy::zombie_processes)] 86 - pub async fn start_server_daemon() -> Result<()> { 87 - // check if the server is running 88 - // start server as a child process 89 - // save the pid in a file under ~/.config/tiles/server_pid 90 - 91 - if (ping().await).is_ok() { 92 - println!("server is already up"); 93 - return Ok(()); 94 - } 95 - 96 - let config_dir = get_config_dir()?; 97 - let server_dir = get_server_dir()?; 98 - let pid_file = config_dir.join("server.pid"); 99 - fs::create_dir_all(&config_dir).context("Failed to create config directory")?; 100 - 101 - let stdout_log = File::create(config_dir.join("server.out.log"))?; 102 - let stderr_log = File::create(config_dir.join("server.err.log"))?; 103 - let child = Command::new("uv") 104 - .args([ 105 - "run", 106 - "--project", 107 - 
server_dir.to_str().unwrap(), 108 - "python", 109 - "-m", 110 - "server.main", 111 - ]) 112 - .stdin(Stdio::null()) 113 - .stdout(Stdio::from(stdout_log)) 114 - .stderr(Stdio::from(stderr_log)) 115 - .spawn() 116 - .expect("failed to start server"); 117 - 118 - fs::create_dir_all(&config_dir).context("Failed to create config directory")?; 119 - std::fs::write(pid_file, child.id().to_string()).unwrap(); 120 - println!("Server started with PID {}", child.id()); 121 - Ok(()) 122 - } 123 - 124 - pub fn stop_server_daemon() -> Result<()> { 125 - let pid_file = get_config_dir()?.join("server.pid"); 126 - 127 - if !pid_file.exists() { 128 - eprintln!("Server is not running"); 129 - return Ok(()); 130 - } 131 - 132 - let pid = std::fs::read_to_string(&pid_file).unwrap(); 133 - Command::new("kill").arg(pid.trim()).status().unwrap(); 134 - std::fs::remove_file(pid_file).unwrap(); 135 - println!("Server stopped."); 136 - Ok(()) 137 - } 138 - async fn run_model_with_server(modelfile: Modelfile) -> reqwest::Result<()> { 155 + async fn run_model_with_server( 156 + mlx_runtime: &MLXRuntime, 157 + modelfile: Modelfile, 158 + ) -> reqwest::Result<()> { 139 159 if !cfg!(debug_assertions) { 140 - let _res = start_server_daemon().await; 160 + let _res = mlx_runtime.start_server_daemon().await; 141 161 let _ = wait_until_server_is_up().await; 142 162 } 143 163 let stdin = io::stdin(); ··· 159 179 match input { 160 180 "exit" => { 161 181 println!("Exiting interactive mode"); 182 + if !cfg!(debug_assertions) { 183 + let _res = mlx_runtime.stop_server_daemon().await; 184 + } 162 185 break; 163 186 } 164 187 _ => { ··· 270 293 271 294 let mut stream = res.bytes_stream(); 272 295 let mut accumulated = String::new(); 273 - // let mut inside_python = false; 274 - // let mut tag_buffer = String::new(); 275 296 println!(); 276 297 while let Some(chunk) = stream.next().await { 277 298 let chunk = chunk.unwrap(); ··· 296 317 } 297 318 } 298 319 } 299 - // println!("{:?}", res); 300 - // if 
res.status() == 200 { 301 - // let text = res.text().await.unwrap(); 302 - // let v: Value = serde_json::from_str(&text).unwrap(); 303 - // let content = v["choices"][0]["message"]["content"] 304 - // .as_str() 305 - // .unwrap_or("<no content>"); 306 - 307 - // // Ok(convert_to_chat_response(content)) 308 - // } else { 309 - // // Err(String::from("request failed")) 310 - // } 311 - // unimplemented!() 312 320 Err(String::from("request failed")) 313 321 } 314 322
-1
src/runner/mod.rs
··· 1 - pub mod mlx;
+7
tilekit/Cargo.toml
··· 1 + [package] 2 + name = "tilekit" 3 + version = "0.1.0" 4 + edition = "2024" 5 + 6 + [dependencies] 7 + nom = "8"
+1
tilekit/src/lib.rs
··· 1 + pub mod modelfile;
+16
tiles/Cargo.toml
··· 1 + [package] 2 + name = "tiles" 3 + version = "0.3.0" 4 + edition = "2024" 5 + 6 + [dependencies] 7 + tilekit = {path = "../tilekit"} 8 + clap = { version = "4.5.48", features = ["derive"] } 9 + reqwest = { version = "0.12", features = ["json", "blocking", "stream"] } 10 + serde = { version = "1.0", features = ["derive"] } 11 + serde_json = "1.0" 12 + anyhow = "1.0" 13 + tokio = { version = "1" , features = ["macros", "rt-multi-thread"]} 14 + owo-colors = "4" 15 + futures-util = "0.3" 16 +
+36
tiles/src/commands/mod.rs
··· 1 + // Module that handles CLI commands 2 + 3 + use anyhow::Result; 4 + use tilekit::{modelfile, modelfile::Modelfile}; 5 + use tiles::runtime::Runtime; 6 + use tiles::{core::health, runtime::RunArgs}; 7 + const DEFAULT_MODELFILE: &str = " 8 + FROM driaforall/mem-agent-mlx-4bit 9 + "; 10 + 11 + pub async fn run(runtime: &Runtime, modelfile: Option<String>) { 12 + let modelfile_parse_result: Result<Modelfile, String> = if let Some(modelfile_str) = modelfile { 13 + modelfile::parse_from_file(modelfile_str.as_str()) 14 + } else { 15 + modelfile::parse(DEFAULT_MODELFILE) 16 + }; 17 + match modelfile_parse_result { 18 + Ok(modelfile) => { 19 + let run_args = RunArgs { modelfile }; 20 + runtime.run(run_args).await; 21 + } 22 + Err(_err) => println!("Invalid Modelfile"), 23 + } 24 + } 25 + 26 + pub fn check_health() { 27 + health::check_health(); 28 + } 29 + 30 + pub async fn start_server(runtime: &Runtime) { 31 + let _ = runtime.start_server_daemon().await; 32 + } 33 + 34 + pub async fn stop_server(runtime: &Runtime) { 35 + let _ = runtime.stop_server_daemon().await; 36 + }
+1
tiles/src/core/mod.rs
··· 1 + pub mod health;
+26
tiles/src/runtime/cpu.rs
··· 1 + use anyhow::Result; 2 + 3 + pub struct CPURuntime {} 4 + 5 + impl Default for CPURuntime { 6 + fn default() -> Self { 7 + Self::new() 8 + } 9 + } 10 + 11 + impl CPURuntime { 12 + pub fn new() -> Self { 13 + CPURuntime {} 14 + } 15 + pub async fn run(&self, _run_args: super::RunArgs) { 16 + unimplemented!() 17 + } 18 + 19 + pub async fn start_server_daemon(&self) -> Result<()> { 20 + unimplemented!() 21 + } 22 + 23 + pub async fn stop_server_daemon(&self) -> Result<()> { 24 + unimplemented!() 25 + } 26 + }
+49
tiles/src/runtime/mod.rs
··· 1 + #[allow(unused_imports)] 2 + use crate::runtime::cpu::CPURuntime; 3 + use crate::runtime::mlx::MLXRuntime; 4 + use anyhow::Result; 5 + use tilekit::modelfile::Modelfile; 6 + pub mod cpu; 7 + pub mod mlx; 8 + 9 + pub struct RunArgs { 10 + pub modelfile: Modelfile, 11 + } 12 + 13 + pub enum Runtime { 14 + Mlx(MLXRuntime), 15 + Cpu(CPURuntime), 16 + } 17 + 18 + impl Runtime { 19 + pub async fn run(&self, run_args: RunArgs) { 20 + match self { 21 + Runtime::Mlx(runtime) => runtime.run(run_args).await, 22 + Runtime::Cpu(runtime) => runtime.run(run_args).await, 23 + } 24 + } 25 + 26 + pub async fn start_server_daemon(&self) -> Result<()> { 27 + match self { 28 + Runtime::Mlx(runtime) => runtime.start_server_daemon().await, 29 + Runtime::Cpu(runtime) => runtime.start_server_daemon().await, 30 + } 31 + } 32 + 33 + pub async fn stop_server_daemon(&self) -> Result<()> { 34 + match self { 35 + Runtime::Mlx(runtime) => runtime.stop_server_daemon().await, 36 + Runtime::Cpu(runtime) => runtime.stop_server_daemon().await, 37 + } 38 + } 39 + } 40 + 41 + #[cfg(target_os = "macos")] 42 + pub fn build_runtime() -> Runtime { 43 + Runtime::Mlx(MLXRuntime::new()) 44 + } 45 + 46 + #[cfg(not(target_os = "macos"))] 47 + pub fn build_runtime() -> Runtime { 48 + Runtime::Cpu(CPURuntime::new()) 49 + }