Merge pull request #30 from tilesprivacy/feat/basic-linux-compatibility

+20

ATTRIBUTIONS.txt

··· 1 + This project includes code derived from third-party open-source projects. 2 + 3 + --- 4 + 5 + Project: mlx-knife 6 + Author: The BROKE team 🦫 7 + Source: https://github.com/mzau/mlx-knife 8 + License: MIT 9 + 10 + Description: 11 + The mlx-related modules from mlx-knife have been used as our starting point and for further reference. 12 + 13 + 14 + Project: mem-agent-mcp 15 + Author: Dria 16 + Source: https://github.com/firstbatchxyz/mem-agent-mcp 17 + License: Apache-2.0 18 + 19 + Description: 20 + The mem-agent CLI modules from mem-agent-mcp have been used as our starting point and for further reference.

+154 -168

Cargo.lock

··· 34 34 35 35 [[package]] 36 36 name = "anstyle-query" 37 - version = "1.1.4" 37 + version = "1.1.5" 38 38 source = "registry+https://github.com/rust-lang/crates.io-index" 39 - checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" 39 + checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" 40 40 dependencies = [ 41 - "windows-sys 0.60.2", 41 + "windows-sys 0.61.2", 42 42 ] 43 43 44 44 [[package]] 45 45 name = "anstyle-wincon" 46 - version = "3.0.10" 46 + version = "3.0.11" 47 47 source = "registry+https://github.com/rust-lang/crates.io-index" 48 - checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" 48 + checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" 49 49 dependencies = [ 50 50 "anstyle", 51 51 "once_cell_polyfill", 52 - "windows-sys 0.60.2", 52 + "windows-sys 0.61.2", 53 53 ] 54 54 55 55 [[package]] ··· 78 78 79 79 [[package]] 80 80 name = "bumpalo" 81 - version = "3.19.0" 81 + version = "3.19.1" 82 82 source = "registry+https://github.com/rust-lang/crates.io-index" 83 - checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 83 + checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" 84 84 85 85 [[package]] 86 86 name = "bytes" 87 - version = "1.10.1" 87 + version = "1.11.0" 88 88 source = "registry+https://github.com/rust-lang/crates.io-index" 89 - checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" 89 + checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" 90 90 91 91 [[package]] 92 92 name = "cc" 93 - version = "1.2.41" 93 + version = "1.2.51" 94 94 source = "registry+https://github.com/rust-lang/crates.io-index" 95 - checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" 95 + checksum = "7a0aeaff4ff1a90589618835a598e545176939b97874f7abc7851caa0618f203" 96 96 dependencies = [ 97 97 "find-msvc-tools", 98 98 "shlex", ··· 106 106 107 107 
[[package]] 108 108 name = "clap" 109 - version = "4.5.50" 109 + version = "4.5.54" 110 110 source = "registry+https://github.com/rust-lang/crates.io-index" 111 - checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" 111 + checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" 112 112 dependencies = [ 113 113 "clap_builder", 114 114 "clap_derive", ··· 116 116 117 117 [[package]] 118 118 name = "clap_builder" 119 - version = "4.5.50" 119 + version = "4.5.54" 120 120 source = "registry+https://github.com/rust-lang/crates.io-index" 121 - checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" 121 + checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" 122 122 dependencies = [ 123 123 "anstream", 124 124 "anstyle", ··· 210 210 211 211 [[package]] 212 212 name = "find-msvc-tools" 213 - version = "0.1.4" 213 + version = "0.1.6" 214 214 source = "registry+https://github.com/rust-lang/crates.io-index" 215 - checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" 215 + checksum = "645cbb3a84e60b7531617d5ae4e57f7e27308f6445f5abf653209ea76dec8dff" 216 216 217 217 [[package]] 218 218 name = "fnv" ··· 350 350 351 351 [[package]] 352 352 name = "hashbrown" 353 - version = "0.16.0" 353 + version = "0.16.1" 354 354 source = "registry+https://github.com/rust-lang/crates.io-index" 355 - checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" 355 + checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" 356 356 357 357 [[package]] 358 358 name = "heck" ··· 362 362 363 363 [[package]] 364 364 name = "http" 365 - version = "1.3.1" 365 + version = "1.4.0" 366 366 source = "registry+https://github.com/rust-lang/crates.io-index" 367 - checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" 367 + checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" 368 368 dependencies = [ 369 369 
"bytes", 370 - "fnv", 371 370 "itoa", 372 371 ] 373 372 ··· 402 401 403 402 [[package]] 404 403 name = "hyper" 405 - version = "1.7.0" 404 + version = "1.8.1" 406 405 source = "registry+https://github.com/rust-lang/crates.io-index" 407 - checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" 406 + checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" 408 407 dependencies = [ 409 408 "atomic-waker", 410 409 "bytes", ··· 456 455 457 456 [[package]] 458 457 name = "hyper-util" 459 - version = "0.1.17" 458 + version = "0.1.19" 460 459 source = "registry+https://github.com/rust-lang/crates.io-index" 461 - checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" 460 + checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" 462 461 dependencies = [ 463 462 "base64", 464 463 "bytes", ··· 482 481 483 482 [[package]] 484 483 name = "icu_collections" 485 - version = "2.0.0" 484 + version = "2.1.1" 486 485 source = "registry+https://github.com/rust-lang/crates.io-index" 487 - checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" 486 + checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" 488 487 dependencies = [ 489 488 "displaydoc", 490 489 "potential_utf", ··· 495 494 496 495 [[package]] 497 496 name = "icu_locale_core" 498 - version = "2.0.0" 497 + version = "2.1.1" 499 498 source = "registry+https://github.com/rust-lang/crates.io-index" 500 - checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" 499 + checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" 501 500 dependencies = [ 502 501 "displaydoc", 503 502 "litemap", ··· 508 507 509 508 [[package]] 510 509 name = "icu_normalizer" 511 - version = "2.0.0" 510 + version = "2.1.1" 512 511 source = "registry+https://github.com/rust-lang/crates.io-index" 513 - checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" 512 + 
checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" 514 513 dependencies = [ 515 - "displaydoc", 516 514 "icu_collections", 517 515 "icu_normalizer_data", 518 516 "icu_properties", ··· 523 521 524 522 [[package]] 525 523 name = "icu_normalizer_data" 526 - version = "2.0.0" 524 + version = "2.1.1" 527 525 source = "registry+https://github.com/rust-lang/crates.io-index" 528 - checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" 526 + checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" 529 527 530 528 [[package]] 531 529 name = "icu_properties" 532 - version = "2.0.1" 530 + version = "2.1.2" 533 531 source = "registry+https://github.com/rust-lang/crates.io-index" 534 - checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" 532 + checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" 535 533 dependencies = [ 536 - "displaydoc", 537 534 "icu_collections", 538 535 "icu_locale_core", 539 536 "icu_properties_data", 540 537 "icu_provider", 541 - "potential_utf", 542 538 "zerotrie", 543 539 "zerovec", 544 540 ] 545 541 546 542 [[package]] 547 543 name = "icu_properties_data" 548 - version = "2.0.1" 544 + version = "2.1.2" 549 545 source = "registry+https://github.com/rust-lang/crates.io-index" 550 - checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" 546 + checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" 551 547 552 548 [[package]] 553 549 name = "icu_provider" 554 - version = "2.0.0" 550 + version = "2.1.1" 555 551 source = "registry+https://github.com/rust-lang/crates.io-index" 556 - checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" 552 + checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" 557 553 dependencies = [ 558 554 "displaydoc", 559 555 "icu_locale_core", 560 - "stable_deref_trait", 561 - "tinystr", 562 556 "writeable", 563 557 "yoke", 
564 558 "zerofrom", ··· 589 583 590 584 [[package]] 591 585 name = "indexmap" 592 - version = "2.12.0" 586 + version = "2.12.1" 593 587 source = "registry+https://github.com/rust-lang/crates.io-index" 594 - checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" 588 + checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" 595 589 dependencies = [ 596 590 "equivalent", 597 591 "hashbrown", ··· 605 599 606 600 [[package]] 607 601 name = "iri-string" 608 - version = "0.7.8" 602 + version = "0.7.10" 609 603 source = "registry+https://github.com/rust-lang/crates.io-index" 610 - checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" 604 + checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" 611 605 dependencies = [ 612 606 "memchr", 613 607 "serde", ··· 615 609 616 610 [[package]] 617 611 name = "is_terminal_polyfill" 618 - version = "1.70.1" 612 + version = "1.70.2" 619 613 source = "registry+https://github.com/rust-lang/crates.io-index" 620 - checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 614 + checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" 621 615 622 616 [[package]] 623 617 name = "itoa" 624 - version = "1.0.15" 618 + version = "1.0.17" 625 619 source = "registry+https://github.com/rust-lang/crates.io-index" 626 - checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 620 + checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" 627 621 628 622 [[package]] 629 623 name = "js-sys" 630 - version = "0.3.81" 624 + version = "0.3.83" 631 625 source = "registry+https://github.com/rust-lang/crates.io-index" 632 - checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" 626 + checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" 633 627 dependencies = [ 634 628 "once_cell", 635 629 "wasm-bindgen", ··· 637 631 638 632 [[package]] 639 
633 name = "libc" 640 - version = "0.2.177" 634 + version = "0.2.179" 641 635 source = "registry+https://github.com/rust-lang/crates.io-index" 642 - checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" 636 + checksum = "c5a2d376baa530d1238d133232d15e239abad80d05838b4b59354e5268af431f" 643 637 644 638 [[package]] 645 639 name = "linux-raw-sys" ··· 649 643 650 644 [[package]] 651 645 name = "litemap" 652 - version = "0.8.0" 646 + version = "0.8.1" 653 647 source = "registry+https://github.com/rust-lang/crates.io-index" 654 - checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" 648 + checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" 655 649 656 650 [[package]] 657 651 name = "log" 658 - version = "0.4.28" 652 + version = "0.4.29" 659 653 source = "registry+https://github.com/rust-lang/crates.io-index" 660 - checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" 654 + checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" 661 655 662 656 [[package]] 663 657 name = "memchr" ··· 673 667 674 668 [[package]] 675 669 name = "mio" 676 - version = "1.1.0" 670 + version = "1.1.1" 677 671 source = "registry+https://github.com/rust-lang/crates.io-index" 678 - checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" 672 + checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" 679 673 dependencies = [ 680 674 "libc", 681 675 "wasi", ··· 716 710 717 711 [[package]] 718 712 name = "once_cell_polyfill" 719 - version = "1.70.1" 713 + version = "1.70.2" 720 714 source = "registry+https://github.com/rust-lang/crates.io-index" 721 - checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" 715 + checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" 722 716 723 717 [[package]] 724 718 name = "openssl" 725 - version = "0.10.74" 719 + version = "0.10.75" 726 720 source = 
"registry+https://github.com/rust-lang/crates.io-index" 727 - checksum = "24ad14dd45412269e1a30f52ad8f0664f0f4f4a89ee8fe28c3b3527021ebb654" 721 + checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" 728 722 dependencies = [ 729 723 "bitflags", 730 724 "cfg-if", ··· 754 748 755 749 [[package]] 756 750 name = "openssl-sys" 757 - version = "0.9.110" 751 + version = "0.9.111" 758 752 source = "registry+https://github.com/rust-lang/crates.io-index" 759 - checksum = "0a9f0075ba3c21b09f8e8b2026584b1d18d49388648f2fbbf3c97ea8deced8e2" 753 + checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" 760 754 dependencies = [ 761 755 "cc", 762 756 "libc", ··· 796 790 797 791 [[package]] 798 792 name = "potential_utf" 799 - version = "0.1.3" 793 + version = "0.1.4" 800 794 source = "registry+https://github.com/rust-lang/crates.io-index" 801 - checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" 795 + checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" 802 796 dependencies = [ 803 797 "zerovec", 804 798 ] 805 799 806 800 [[package]] 807 801 name = "proc-macro2" 808 - version = "1.0.101" 802 + version = "1.0.104" 809 803 source = "registry+https://github.com/rust-lang/crates.io-index" 810 - checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" 804 + checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" 811 805 dependencies = [ 812 806 "unicode-ident", 813 807 ] 814 808 815 809 [[package]] 816 810 name = "quote" 817 - version = "1.0.41" 811 + version = "1.0.42" 818 812 source = "registry+https://github.com/rust-lang/crates.io-index" 819 - checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" 813 + checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" 820 814 dependencies = [ 821 815 "proc-macro2", 822 816 ] ··· 829 823 830 824 [[package]] 831 825 name = "reqwest" 832 - version = "0.12.24" 826 
+ version = "0.12.28" 833 827 source = "registry+https://github.com/rust-lang/crates.io-index" 834 - checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" 828 + checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" 835 829 dependencies = [ 836 830 "base64", 837 831 "bytes", ··· 887 881 888 882 [[package]] 889 883 name = "rustix" 890 - version = "1.1.2" 884 + version = "1.1.3" 891 885 source = "registry+https://github.com/rust-lang/crates.io-index" 892 - checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" 886 + checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" 893 887 dependencies = [ 894 888 "bitflags", 895 889 "errno", ··· 900 894 901 895 [[package]] 902 896 name = "rustls" 903 - version = "0.23.33" 897 + version = "0.23.35" 904 898 source = "registry+https://github.com/rust-lang/crates.io-index" 905 - checksum = "751e04a496ca00bb97a5e043158d23d66b5aabf2e1d5aa2a0aaebb1aafe6f82c" 899 + checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" 906 900 dependencies = [ 907 901 "once_cell", 908 902 "rustls-pki-types", ··· 913 907 914 908 [[package]] 915 909 name = "rustls-pki-types" 916 - version = "1.12.0" 910 + version = "1.13.2" 917 911 source = "registry+https://github.com/rust-lang/crates.io-index" 918 - checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" 912 + checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" 919 913 dependencies = [ 920 914 "zeroize", 921 915 ] 922 916 923 917 [[package]] 924 918 name = "rustls-webpki" 925 - version = "0.103.7" 919 + version = "0.103.8" 926 920 source = "registry+https://github.com/rust-lang/crates.io-index" 927 - checksum = "e10b3f4191e8a80e6b43eebabfac91e5dcecebb27a71f04e820c47ec41d314bf" 921 + checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" 928 922 dependencies = [ 929 923 "ring", 930 924 "rustls-pki-types", ··· 939 933 
940 934 [[package]] 941 935 name = "ryu" 942 - version = "1.0.20" 936 + version = "1.0.22" 943 937 source = "registry+https://github.com/rust-lang/crates.io-index" 944 - checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 938 + checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" 945 939 946 940 [[package]] 947 941 name = "schannel" ··· 1007 1001 1008 1002 [[package]] 1009 1003 name = "serde_json" 1010 - version = "1.0.145" 1004 + version = "1.0.148" 1011 1005 source = "registry+https://github.com/rust-lang/crates.io-index" 1012 - checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" 1006 + checksum = "3084b546a1dd6289475996f182a22aba973866ea8e8b02c51d9f46b1336a22da" 1013 1007 dependencies = [ 1014 1008 "itoa", 1015 1009 "memchr", 1016 - "ryu", 1017 1010 "serde", 1018 1011 "serde_core", 1012 + "zmij", 1019 1013 ] 1020 1014 1021 1015 [[package]] ··· 1078 1072 1079 1073 [[package]] 1080 1074 name = "syn" 1081 - version = "2.0.107" 1075 + version = "2.0.113" 1082 1076 source = "registry+https://github.com/rust-lang/crates.io-index" 1083 - checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" 1077 + checksum = "678faa00651c9eb72dd2020cbdf275d92eccb2400d568e419efdd64838145cb4" 1084 1078 dependencies = [ 1085 1079 "proc-macro2", 1086 1080 "quote", ··· 1130 1124 1131 1125 [[package]] 1132 1126 name = "tempfile" 1133 - version = "3.23.0" 1127 + version = "3.24.0" 1134 1128 source = "registry+https://github.com/rust-lang/crates.io-index" 1135 - checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" 1129 + checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" 1136 1130 dependencies = [ 1137 1131 "fastrand", 1138 1132 "getrandom 0.3.4", ··· 1142 1136 ] 1143 1137 1144 1138 [[package]] 1139 + name = "tilekit" 1140 + version = "0.1.0" 1141 + dependencies = [ 1142 + "nom", 1143 + ] 1144 + 1145 + [[package]] 1145 1146 name = "tiles" 1146 
- version = "0.2.0" 1147 + version = "0.3.0" 1147 1148 dependencies = [ 1148 1149 "anyhow", 1149 1150 "clap", 1150 1151 "futures-util", 1151 - "nom", 1152 1152 "owo-colors", 1153 1153 "reqwest", 1154 1154 "serde", 1155 1155 "serde_json", 1156 + "tilekit", 1156 1157 "tokio", 1157 1158 ] 1158 1159 1159 1160 [[package]] 1160 1161 name = "tinystr" 1161 - version = "0.8.1" 1162 + version = "0.8.2" 1162 1163 source = "registry+https://github.com/rust-lang/crates.io-index" 1163 - checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" 1164 + checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" 1164 1165 dependencies = [ 1165 1166 "displaydoc", 1166 1167 "zerovec", ··· 1168 1169 1169 1170 [[package]] 1170 1171 name = "tokio" 1171 - version = "1.48.0" 1172 + version = "1.49.0" 1172 1173 source = "registry+https://github.com/rust-lang/crates.io-index" 1173 - checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" 1174 + checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" 1174 1175 dependencies = [ 1175 1176 "bytes", 1176 1177 "libc", ··· 1214 1215 1215 1216 [[package]] 1216 1217 name = "tokio-util" 1217 - version = "0.7.16" 1218 + version = "0.7.18" 1218 1219 source = "registry+https://github.com/rust-lang/crates.io-index" 1219 - checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" 1220 + checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" 1220 1221 dependencies = [ 1221 1222 "bytes", 1222 1223 "futures-core", ··· 1242 1243 1243 1244 [[package]] 1244 1245 name = "tower-http" 1245 - version = "0.6.6" 1246 + version = "0.6.8" 1246 1247 source = "registry+https://github.com/rust-lang/crates.io-index" 1247 - checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" 1248 + checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" 1248 1249 dependencies = [ 1249 1250 "bitflags", 1250 1251 "bytes", 
··· 1272 1273 1273 1274 [[package]] 1274 1275 name = "tracing" 1275 - version = "0.1.41" 1276 + version = "0.1.44" 1276 1277 source = "registry+https://github.com/rust-lang/crates.io-index" 1277 - checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" 1278 + checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" 1278 1279 dependencies = [ 1279 1280 "pin-project-lite", 1280 1281 "tracing-core", ··· 1282 1283 1283 1284 [[package]] 1284 1285 name = "tracing-core" 1285 - version = "0.1.34" 1286 + version = "0.1.36" 1286 1287 source = "registry+https://github.com/rust-lang/crates.io-index" 1287 - checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" 1288 + checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" 1288 1289 dependencies = [ 1289 1290 "once_cell", 1290 1291 ] ··· 1297 1298 1298 1299 [[package]] 1299 1300 name = "unicode-ident" 1300 - version = "1.0.19" 1301 + version = "1.0.22" 1301 1302 source = "registry+https://github.com/rust-lang/crates.io-index" 1302 - checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" 1303 + checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" 1303 1304 1304 1305 [[package]] 1305 1306 name = "untrusted" ··· 1363 1364 1364 1365 [[package]] 1365 1366 name = "wasm-bindgen" 1366 - version = "0.2.104" 1367 + version = "0.2.106" 1367 1368 source = "registry+https://github.com/rust-lang/crates.io-index" 1368 - checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" 1369 + checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" 1369 1370 dependencies = [ 1370 1371 "cfg-if", 1371 1372 "once_cell", ··· 1375 1376 ] 1376 1377 1377 1378 [[package]] 1378 - name = "wasm-bindgen-backend" 1379 - version = "0.2.104" 1380 - source = "registry+https://github.com/rust-lang/crates.io-index" 1381 - checksum = 
"671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" 1382 - dependencies = [ 1383 - "bumpalo", 1384 - "log", 1385 - "proc-macro2", 1386 - "quote", 1387 - "syn", 1388 - "wasm-bindgen-shared", 1389 - ] 1390 - 1391 - [[package]] 1392 1379 name = "wasm-bindgen-futures" 1393 - version = "0.4.54" 1380 + version = "0.4.56" 1394 1381 source = "registry+https://github.com/rust-lang/crates.io-index" 1395 - checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" 1382 + checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" 1396 1383 dependencies = [ 1397 1384 "cfg-if", 1398 1385 "js-sys", ··· 1403 1390 1404 1391 [[package]] 1405 1392 name = "wasm-bindgen-macro" 1406 - version = "0.2.104" 1393 + version = "0.2.106" 1407 1394 source = "registry+https://github.com/rust-lang/crates.io-index" 1408 - checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" 1395 + checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" 1409 1396 dependencies = [ 1410 1397 "quote", 1411 1398 "wasm-bindgen-macro-support", ··· 1413 1400 1414 1401 [[package]] 1415 1402 name = "wasm-bindgen-macro-support" 1416 - version = "0.2.104" 1403 + version = "0.2.106" 1417 1404 source = "registry+https://github.com/rust-lang/crates.io-index" 1418 - checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" 1405 + checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" 1419 1406 dependencies = [ 1407 + "bumpalo", 1420 1408 "proc-macro2", 1421 1409 "quote", 1422 1410 "syn", 1423 - "wasm-bindgen-backend", 1424 1411 "wasm-bindgen-shared", 1425 1412 ] 1426 1413 1427 1414 [[package]] 1428 1415 name = "wasm-bindgen-shared" 1429 - version = "0.2.104" 1416 + version = "0.2.106" 1430 1417 source = "registry+https://github.com/rust-lang/crates.io-index" 1431 - checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" 1418 + checksum = 
"cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" 1432 1419 dependencies = [ 1433 1420 "unicode-ident", 1434 1421 ] ··· 1448 1435 1449 1436 [[package]] 1450 1437 name = "web-sys" 1451 - version = "0.3.81" 1438 + version = "0.3.83" 1452 1439 source = "registry+https://github.com/rust-lang/crates.io-index" 1453 - checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" 1440 + checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" 1454 1441 dependencies = [ 1455 1442 "js-sys", 1456 1443 "wasm-bindgen", ··· 1458 1445 1459 1446 [[package]] 1460 1447 name = "windows-link" 1461 - version = "0.1.3" 1462 - source = "registry+https://github.com/rust-lang/crates.io-index" 1463 - checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" 1464 - 1465 - [[package]] 1466 - name = "windows-link" 1467 1448 version = "0.2.1" 1468 1449 source = "registry+https://github.com/rust-lang/crates.io-index" 1469 1450 checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" 1470 1451 1471 1452 [[package]] 1472 1453 name = "windows-registry" 1473 - version = "0.5.3" 1454 + version = "0.6.1" 1474 1455 source = "registry+https://github.com/rust-lang/crates.io-index" 1475 - checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e" 1456 + checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" 1476 1457 dependencies = [ 1477 - "windows-link 0.1.3", 1458 + "windows-link", 1478 1459 "windows-result", 1479 1460 "windows-strings", 1480 1461 ] 1481 1462 1482 1463 [[package]] 1483 1464 name = "windows-result" 1484 - version = "0.3.4" 1465 + version = "0.4.1" 1485 1466 source = "registry+https://github.com/rust-lang/crates.io-index" 1486 - checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" 1467 + checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" 1487 1468 dependencies = [ 1488 - "windows-link 0.1.3", 1469 + 
"windows-link", 1489 1470 ] 1490 1471 1491 1472 [[package]] 1492 1473 name = "windows-strings" 1493 - version = "0.4.2" 1474 + version = "0.5.1" 1494 1475 source = "registry+https://github.com/rust-lang/crates.io-index" 1495 - checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" 1476 + checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" 1496 1477 dependencies = [ 1497 - "windows-link 0.1.3", 1478 + "windows-link", 1498 1479 ] 1499 1480 1500 1481 [[package]] ··· 1521 1502 source = "registry+https://github.com/rust-lang/crates.io-index" 1522 1503 checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" 1523 1504 dependencies = [ 1524 - "windows-link 0.2.1", 1505 + "windows-link", 1525 1506 ] 1526 1507 1527 1508 [[package]] ··· 1546 1527 source = "registry+https://github.com/rust-lang/crates.io-index" 1547 1528 checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" 1548 1529 dependencies = [ 1549 - "windows-link 0.2.1", 1530 + "windows-link", 1550 1531 "windows_aarch64_gnullvm 0.53.1", 1551 1532 "windows_aarch64_msvc 0.53.1", 1552 1533 "windows_i686_gnu 0.53.1", ··· 1661 1642 1662 1643 [[package]] 1663 1644 name = "writeable" 1664 - version = "0.6.1" 1645 + version = "0.6.2" 1665 1646 source = "registry+https://github.com/rust-lang/crates.io-index" 1666 - checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" 1647 + checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" 1667 1648 1668 1649 [[package]] 1669 1650 name = "yoke" 1670 - version = "0.8.0" 1651 + version = "0.8.1" 1671 1652 source = "registry+https://github.com/rust-lang/crates.io-index" 1672 - checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" 1653 + checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" 1673 1654 dependencies = [ 1674 - "serde", 1675 1655 "stable_deref_trait", 1676 1656 "yoke-derive", 1677 1657 
"zerofrom", ··· 1679 1659 1680 1660 [[package]] 1681 1661 name = "yoke-derive" 1682 - version = "0.8.0" 1662 + version = "0.8.1" 1683 1663 source = "registry+https://github.com/rust-lang/crates.io-index" 1684 - checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" 1664 + checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" 1685 1665 dependencies = [ 1686 1666 "proc-macro2", 1687 1667 "quote", ··· 1718 1698 1719 1699 [[package]] 1720 1700 name = "zerotrie" 1721 - version = "0.2.2" 1701 + version = "0.2.3" 1722 1702 source = "registry+https://github.com/rust-lang/crates.io-index" 1723 - checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" 1703 + checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" 1724 1704 dependencies = [ 1725 1705 "displaydoc", 1726 1706 "yoke", ··· 1729 1709 1730 1710 [[package]] 1731 1711 name = "zerovec" 1732 - version = "0.11.4" 1712 + version = "0.11.5" 1733 1713 source = "registry+https://github.com/rust-lang/crates.io-index" 1734 - checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" 1714 + checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" 1735 1715 dependencies = [ 1736 1716 "yoke", 1737 1717 "zerofrom", ··· 1740 1720 1741 1721 [[package]] 1742 1722 name = "zerovec-derive" 1743 - version = "0.11.1" 1723 + version = "0.11.2" 1744 1724 source = "registry+https://github.com/rust-lang/crates.io-index" 1745 - checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" 1725 + checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" 1746 1726 dependencies = [ 1747 1727 "proc-macro2", 1748 1728 "quote", 1749 1729 "syn", 1750 1730 ] 1731 + 1732 + [[package]] 1733 + name = "zmij" 1734 + version = "1.0.10" 1735 + source = "registry+https://github.com/rust-lang/crates.io-index" 1736 + checksum = "30e0d8dffbae3d840f64bda38e28391faef673a7b5a6017840f2a106c8145868"

+6 -14

Cargo.toml

··· 1 - [package] 2 - name = "tiles" 3 - version = "0.2.0" 4 - edition = "2024" 1 + [workspace] 2 + resolver = "3" 3 + members = [ 4 + "tilekit", 5 + "tiles", 6 + ] 5 7 6 - [dependencies] 7 - clap = { version = "4.5.48", features = ["derive"] } 8 - nom = "8" 9 - reqwest = { version = "0.12", features = ["json", "blocking", "stream"] } 10 - serde = { version = "1.0", features = ["derive"] } 11 - serde_json = "1.0" 12 - anyhow = "1.0" 13 - tokio = { version = "1" , features = ["macros", "rt-multi-thread"]} 14 - owo-colors = "4" 15 - futures-util = "0.3"

+6

HACKING.md

··· 30 30 ``` 31 31 32 32 2. In another terminal, run the Rust CLI using Cargo as usual. 33 + 34 + ```sh 35 + cd tiles 36 + 37 + cargo run 38 + ```

fixtures/a.modelfile tilekit/fixtures/a.modelfile

fixtures/llama_bad.Modelfile tilekit/fixtures/llama_bad.Modelfile

fixtures/mistral.modelfile tilekit/fixtures/mistral.modelfile

+2 -1

justfile

··· 12 12 cargo test 13 13 14 14 serve: 15 - uv run --project server python -m server.main 15 + server/.venv/bin/python3 -m server.main 16 + # uv run --project server python -m server.main 16 17 17 18 bundle: 18 19 ./scripts/bundler.sh

+1 -1

scripts/bundler.sh

··· 22 22 rm -rf "${DIST_DIR}/tmp/server/.venv" 23 23 24 24 echo "📦 Creating ${OUT_NAME}.tar.gz..." 25 - tar -czf "${DIST_DIR}/${OUT_NAME}.tar.gz" -C "${DIST_DIR}/tmp" . 25 + tar --exclude-from=scripts/tar.exclude -czf "${DIST_DIR}/${OUT_NAME}.tar.gz" -C "${DIST_DIR}/tmp" . 26 26 27 27 rm -rf "${DIST_DIR}/tmp" 28 28

+5 -5

scripts/install.sh

··· 1 1 #!/usr/bin/env bash 2 2 set -euo pipefail 3 3 4 - ENV="prod" # prod is another env, try taking it from github env 5 - REPO="tilesprivacy/tilekit" 4 + ENV="dev" # prod is another env, try taking it from github env 5 + REPO="tilesprivacy/tiles" 6 6 # VERSION="${TILES_VERSION:-latest}" 7 - VERSION="0.2.0" 7 + VERSION="0.3.0" 8 8 INSTALL_DIR="$HOME/.local/bin" # CLI install location 9 9 SERVER_DIR="$HOME/.local/share/tiles/server" # Python server folder 10 10 TMPDIR="$(mktemp -d)" ··· 43 43 TAR_URL="https://github.com/${REPO}/releases/download/${VERSION}/tiles-v${VERSION}-${ARCH}-${OS}.tar.gz" 44 44 curl -fsSL -o "${TMPDIR}/tiles.tar.gz" "$TAR_URL" 45 45 else 46 - # Installer suppose to ran from tilekit root folder after running the bundler 46 + # Installer suppose to ran from tiles root folder after running the bundler 47 47 mv "dist/tiles-v${VERSION}-${ARCH}-${OS}.tar.gz" "${TMPDIR}/tiles.tar.gz" 48 48 fi 49 49 ··· 71 71 log "Installing Python 3.13 via Homebrew..." 72 72 brew install python@3.13 || err "Failed to install Python 3.13" 73 73 else 74 - err "Python 3.13 is required but not found. Please install it manually." 74 + err "Python 3.13 is required but not found. Please install it manuallyv and retry installing tiles" 75 75 fi 76 76 fi 77 77

+8

scripts/tar.exclude

··· 1 + __pycache__ 2 + *.pyc 3 + *.pyo 4 + .venv 5 + .env 6 + .git 7 + .DS_Store 8 +

+1

server/.gitignore

··· 1 1 __pycache__/ 2 2 *.egg-info/ 3 3 .venv/ 4 + backend/__pycache__

+27 -414

server/api.py

··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 
22 - 23 1 from fastapi import FastAPI, HTTPException 2 + 3 + from .schemas import ChatMessage, ChatCompletionRequest, StartRequest, downloadRequest 24 4 from .config import SYSTEM_PROMPT 25 5 import logging 26 - import json 27 - import time 28 - import uuid 29 - from collections.abc import AsyncGenerator 30 - from typing import Any, Dict, List, Optional, Union 6 + import sys 7 + from typing import Optional 31 8 32 9 from fastapi.responses import StreamingResponse 33 10 from pydantic import BaseModel, Field 34 11 35 - from .cache_utils import ( 36 - get_model_path 37 - ) 38 12 from .hf_downloader import pull_model 39 13 40 - from .mlx_runner import MLXRunner 41 - 42 - from server.mem_agent.utils import extract_python_code, extract_reply, extract_thoughts, create_memory_if_not_exists, format_results 14 + from server.mem_agent.utils import ( 15 + create_memory_if_not_exists, 16 + format_results, 17 + ) 43 18 from server.mem_agent.engine import execute_sandboxed_code 44 - # Global model cache and configuration 19 + 20 + from . 
import runtime 45 21 46 22 logger = logging.getLogger("app") 47 - _model_cache: Dict[str, MLXRunner] = {} 48 23 _current_model_path: Optional[str] = None 49 24 _default_max_tokens: Optional[int] = None # Use dynamic model-aware limits by default 50 - _runner: MLXRunner = {} 51 - _max_tool_turns = 5 52 25 _memory_path = "" 53 26 54 - class CompletionRequest(BaseModel): 55 - model: str 56 - prompt: Union[str, List[str]] 57 - max_tokens: Optional[int] = None 58 - temperature: Optional[float] = 0.7 59 - top_p: Optional[float] = 0.9 60 - stream: Optional[bool] = False 61 - stop: Optional[Union[str, List[str]]] = None 62 - repetition_penalty: Optional[float] = 1.1 27 + _messages: list[ChatMessage] = [] 63 28 64 29 65 - class ChatMessage(BaseModel): 66 - role: str = Field(..., pattern="^(system|user|assistant)$") 67 - content: str 68 - 69 - _messages: list[ChatMessage]= [] 70 - 71 - class ChatCompletionRequest(BaseModel): 72 - model: str 73 - messages: List[ChatMessage] 74 - chat_start: bool 75 - python_code: str 76 - max_tokens: Optional[int] = None 77 - temperature: Optional[float] = 0.7 78 - top_p: Optional[float] = 0.9 79 - stream: Optional[bool] = False 80 - stop: Optional[Union[str, List[str]]] = None 81 - repetition_penalty: Optional[float] = 1.1 82 - 83 - 84 - class CompletionResponse(BaseModel): 85 - id: str 86 - object: str = "text_completion" 87 - created: int 88 - model: str 89 - choices: List[Dict[str, Any]] 90 - usage: Dict[str, int] 91 - 92 - 93 - class ChatCompletionResponse(BaseModel): 94 - id: str 95 - object: str = "chat.completion" 96 - created: int 97 - model: str 98 - choices: List[Dict[str, Any]] 99 - # usage: Dict[str, int] 100 - 101 - 102 - class ModelInfo(BaseModel): 103 - id: str 104 - object: str = "model" 105 - owned_by: str = "mlx-knife" 106 - permission: List = [] 107 - context_length: Optional[int] = None 108 - 109 - class StartRequest(BaseModel): 110 - model: str 111 - memory_path: str 112 - 113 - class downloadRequest(BaseModel): 114 - 
model: str 115 - 116 - class Agent: 117 - def __init__( 118 - self, 119 - max_tool_turns: int = 20, 120 - memory_path: str = None, 121 - use_vllm: bool = False, 122 - model: str = None, 123 - predetermined_memory_path: bool = False, 124 - model_cache: Dict[str, MLXRunner] = {}, 125 - current_model_path: Optional[str] = None, 126 - default_max_tokens: Optional[int] = None # Use dynamic model-aware limits by default 127 - 128 - ): 129 - # Load the system prompt and add it to the conversation history 130 - self.system_prompt = SYSTEM_PROMPT 131 - self.messages: list[ChatMessage] = [ 132 - ChatMessage(role="system", content=self.system_prompt) 133 - ] 134 - 135 - # Set the maximum number of tool turns and use_vllm flag 136 - self.max_tool_turns = max_tool_turns 137 - self.use_vllm = use_vllm 138 - 139 30 app = FastAPI() 140 31 141 - agent: Agent() 142 - 143 - def get_or_load_model(model_spec: str, verbose: bool = False) -> MLXRunner: 144 - """Get model from cache or load it if not cached.""" 145 - global _model_cache, _current_model_path 146 - 147 - # Use the existing model path resolution from cache_utils 148 - 149 - try: 150 - model_path, model_name, commit_hash = get_model_path(model_spec) 151 - if not model_path.exists(): 152 - logger.info(f"Model {model_spec} not found in cache") 153 - raise HTTPException(status_code=404, detail=f"Model {model_spec} not found in cache") 154 - except Exception as e: 155 - logger.info(f"Model {model_spec} not found in: {str(e)}") 156 - raise HTTPException(status_code=404, detail=f"Model {model_spec} not found: {str(e)}") 157 - 158 - # Check if it's an MLX model 159 - 160 - model_path_str = str(model_path) 161 - 162 - # Check if we need to load a different model 163 - if _current_model_path != model_path_str: 164 - # Proactively clean up any previously loaded runner to release memory 165 - if _model_cache: 166 - try: 167 - for _old_runner in list(_model_cache.values()): 168 - try: 169 - _old_runner.cleanup() 170 - except Exception: 
171 - pass 172 - finally: 173 - _model_cache.clear() 174 - 175 - # Load new model 176 - if verbose: 177 - print(f"Loading model: {model_name}") 178 - 179 - logger.info(f"Loading model: {model_name}") 180 - runner = MLXRunner(model_path_str, verbose=verbose) 181 - runner.load_model() 182 - 183 - _model_cache[model_path_str] = runner 184 - _current_model_path = model_path_str 185 - else: 186 - logger.info(f"Model {model_name} already in memory") 187 - 188 - return _model_cache[model_path_str] 189 - 190 - def format_chat_messages_for_runner(messages: List[ChatMessage]) -> List[Dict[str, str]]: 191 - """Convert chat messages to format expected by MLXRunner. 192 - 193 - Returns messages in dict format for the runner to apply chat templates. 194 - """ 195 - return [{"role": msg.role, "content": msg.content} for msg in messages] 196 - 197 - 198 - def count_tokens(text: str) -> int: 199 - """Rough token count estimation.""" 200 - return int(len(text.split()) * 1.3) # Approximation, convert to int 201 32 202 33 @app.get("/ping") 203 34 async def ping(): 204 - return {"message": "Badda-Bing Badda-Bang"} 35 + return {"message": "Badda-Bing Badda-Bang"} 36 + 205 37 206 38 @app.post("/download") 207 - async def download(request:downloadRequest): 208 - """ Download the model """ 209 - try: 210 - if pull_model(request.model): 211 - return {"message": "Model downloaded"} 212 - else: 213 - raise HTTPException(status_code=400, detail="Downloading model failed") 214 - except Exception as e: 215 - raise HTTPException(status_code=500, detail=str(e)) 39 + async def download(request: downloadRequest): 40 + """Download the model""" 41 + runtime.backend.download_model(request.model) 216 42 217 43 @app.post("/start") 218 44 async def start_model(request: StartRequest): 219 45 """Load the model and start the agent""" 220 - global _messages, _runner,_memory_path 46 + global _messages, _runner, _memory_path 221 47 222 48 _messages = [ChatMessage(role="system", content=SYSTEM_PROMPT)] 223 49 
_memory_path = request.memory_path 50 + logger.info(f"{runtime.backend}") 51 + runtime.backend.get_or_load_model(request.model) 52 + return {"message": "Model loaded"} 224 53 225 - _runner = get_or_load_model(request.model) 226 - return {"message": "Model loaded"} 227 54 228 55 @app.post("/v1/chat/completions") 229 56 async def create_chat_completion(request: ChatCompletionRequest): 230 57 """Create a chat completion.""" 231 - global _messages, _max_tool_turns, _memory_path 58 + global _messages, _memory_path 232 59 try: 233 - runner = get_or_load_model(request.model) 234 60 235 61 if request.stream: 236 62 result = ({}, "") 237 63 if request.python_code: 238 - create_memory_if_not_exists() 239 64 result = execute_sandboxed_code( 240 65 code=request.python_code, 241 66 allowed_path=_memory_path, 242 67 import_module="server.mem_agent.tools", 243 68 ) 244 69 245 - _messages.append(ChatMessage(role="user", content=format_results(result[0], result[1]))) 246 - 70 + _messages.append( 71 + ChatMessage(role="user", content=format_results(result[0], result[1])) 72 + ) 73 + 247 74 # Streaming response 248 75 return StreamingResponse( 249 - generate_chat_stream(runner, request.messages, request), 76 + runtime.backend.generate_chat_stream(_messages, request), 250 77 media_type="text/plain", 251 - headers={"Cache-Control": "no-cache"} 252 - ) 253 - else: 254 - # Non-streaming response 255 - completion_id = f"chatcmpl-{uuid.uuid4()}" 256 - created = int(time.time()) 257 - 258 - # Convert messages to dict format for runner 259 - # _messages.append(system_message) 260 - if request.chat_start: 261 - _messages.extend(request.messages) 262 - message_dicts = format_chat_messages_for_runner(_messages) 263 - # Let the runner format with chat templates 264 - prompt = runner._format_conversation(message_dicts, use_chat_template=True) 265 - 266 - generated_text = runner.generate_batch( 267 - prompt=prompt, 268 - max_tokens=runner.get_effective_max_tokens(request.max_tokens or 
_default_max_tokens, interactive=False), 269 - temperature=request.temperature, 270 - top_p=request.top_p, 271 - repetition_penalty=request.repetition_penalty, 272 - use_chat_template=False # Already applied in _format_conversation 273 - ) 274 - 275 - # Token counting 276 - total_prompt = "\n\n".join([msg.content for msg in request.messages]) 277 - prompt_tokens = count_tokens(total_prompt) 278 - completion_tokens = count_tokens(generated_text) 279 - 280 - logger.info(f"prompt_token\n{prompt_tokens}") 281 - logger.info(f"completion_tokens\n{completion_tokens}") 282 - 283 - thoughts = extract_thoughts(generated_text) 284 - reply = extract_reply(generated_text) 285 - python_code = extract_python_code(generated_text) 286 - 287 - result = ({}, "") 288 - if python_code: 289 - create_memory_if_not_exists() 290 - result = execute_sandboxed_code( 291 - code=python_code, 292 - allowed_path=_memory_path, 293 - import_module="server.mem_agent.tools", 294 - ) 295 - 296 - logger.info(f"Model thoughts\n{thoughts}") 297 - logger.info(f"Model reply\n{reply}") 298 - logger.info(f"Model python\n{python_code}") 299 - logger.info(f"executed python result\n{str(result)}") 300 - 301 - # while remaining_tool_turns > 0 and not reply: 302 - # logger.info(f"Turn count\n{remaining_tool_turns}") 303 - _messages.append(ChatMessage(role="user", content=format_results(result[0], result[1]))) 304 - message_dicts = format_chat_messages_for_runner(_messages) 305 - # # Let the runner format with chat templates 306 - # prompt = runner._format_conversation(message_dicts, use_chat_template=True) 307 - # generated_text = runner.generate_batch( 308 - # prompt=prompt 309 - # ) 310 - 311 - # total_prompt = "\n\n".join([msg.content for msg in _messages]) 312 - # prompt_tokens = count_tokens(total_prompt) 313 - # completion_tokens = count_tokens(generated_text) 314 - 315 - # logger.info(f"prompt_token\n{prompt_tokens}") 316 - # logger.info(f"completion_tokens\n{completion_tokens}") 317 - 318 - # # 
print(generated_text) 319 - # # Extract the thoughts, reply and python code from the response 320 - # thoughts = extract_thoughts(generated_text) 321 - # reply = extract_reply(generated_text) 322 - # python_code = extract_python_code(generated_text) 323 - 324 - # logger.info(f"Model thoughts\n{thoughts}") 325 - # logger.info(f"Model reply\n{reply}") 326 - # logger.info(f"Model python\n{python_code}") 327 - 328 - # _messages.append(ChatMessage(role="assistant", content=generated_text)) 329 - # if python_code: 330 - # create_memory_if_not_exists() 331 - # result = execute_sandboxed_code( 332 - # code=python_code, 333 - # allowed_path=_memory_path, 334 - # import_module="server.mem_agent.tools", 335 - # ) 336 - # logger.info(f"executed python result\n{str(result)}") 337 - # else: 338 - # # Reset result when no Python code is executed 339 - # result = ({}, "") 340 - # logger.info(f"executed python result\n{str(result)}") 341 - # remaining_tool_turns -= 1 342 - 343 - return ChatCompletionResponse( 344 - id=completion_id, 345 - created=created, 346 - model=request.model, 347 - choices=[ 348 - { 349 - "index": 0, 350 - "message": { 351 - "role": "assistant", 352 - "content": generated_text 353 - }, 354 - "finish_reason": "stop" 355 - } 356 - ], 357 - # usage={ 358 - # "prompt_tokens": prompt_tokens, 359 - # "completion_tokens": completion_tokens, 360 - # "total_tokens": prompt_tokens + completion_tokens 361 - # } 78 + headers={"Cache-Control": "no-cache"}, 362 79 ) 363 80 except Exception as e: 364 81 raise HTTPException(status_code=500, detail=str(e)) 365 - 366 - async def generate_chat_stream( 367 - runner: MLXRunner, 368 - messages: List[ChatMessage], 369 - request: ChatCompletionRequest 370 - ) -> AsyncGenerator[str, None]: 371 - """Generate streaming chat completion response.""" 372 - 373 - global _messages 374 - completion_id = f"chatcmpl-{uuid.uuid4()}" 375 - created = int(time.time()) 376 - 377 - if request.chat_start: 378 - _messages.extend(request.messages) 379 
- # Convert messages to dict format for runner 380 - message_dicts = format_chat_messages_for_runner(_messages) 381 - 382 - # Let the runner format with chat templates 383 - prompt = runner._format_conversation(message_dicts, use_chat_template=True) 384 - 385 - # Yield initial response 386 - initial_response = { 387 - "id": completion_id, 388 - "object": "chat.completion.chunk", 389 - "created": created, 390 - "model": request.model, 391 - "choices": [ 392 - { 393 - "index": 0, 394 - "delta": {"role": "assistant", "content": ""}, 395 - "finish_reason": None 396 - } 397 - ] 398 - } 399 - 400 - yield f"data: {json.dumps(initial_response)}\n\n" 401 - 402 - # Stream tokens 403 - try: 404 - for token in runner.generate_streaming( 405 - prompt=prompt, 406 - max_tokens=runner.get_effective_max_tokens(request.max_tokens or _default_max_tokens, interactive=False), 407 - temperature=request.temperature, 408 - top_p=request.top_p, 409 - repetition_penalty=request.repetition_penalty, 410 - use_chat_template=False, # Already applied in _format_conversation 411 - use_chat_stop_tokens=False # Server mode shouldn't stop on chat markers 412 - ): 413 - chunk_response = { 414 - "id": completion_id, 415 - "object": "chat.completion.chunk", 416 - "created": created, 417 - "model": request.model, 418 - "choices": [ 419 - { 420 - "index": 0, 421 - "delta": {"content": token}, 422 - "finish_reason": None 423 - } 424 - ] 425 - } 426 - 427 - yield f"data: {json.dumps(chunk_response)}\n\n" 428 - 429 - # Check for stop sequences 430 - if request.stop: 431 - stop_sequences = request.stop if isinstance(request.stop, list) else [request.stop] 432 - if any(stop in token for stop in stop_sequences): 433 - break 434 - 435 - except Exception as e: 436 - error_response = { 437 - "id": completion_id, 438 - "object": "chat.completion.chunk", 439 - "created": created, 440 - "model": request.model, 441 - "choices": [ 442 - { 443 - "index": 0, 444 - "delta": {}, 445 - "finish_reason": "error" 446 - } 447 
- ], 448 - "error": str(e) 449 - } 450 - yield f"data: {json.dumps(error_response)}\n\n" 451 - 452 - # Final response 453 - final_response = { 454 - "id": completion_id, 455 - "object": "chat.completion.chunk", 456 - "created": created, 457 - "model": request.model, 458 - "choices": [ 459 - { 460 - "index": 0, 461 - "delta": {}, 462 - "finish_reason": "stop" 463 - } 464 - ] 465 - } 466 - 467 - yield f"data: {json.dumps(final_response)}\n\n" 468 - yield "data: [DONE]\n\n"

+1

server/backend/linux.py

··· 1 + # Module for linux backend

+183

server/backend/mlx.py

··· 1 + from .mlx_runner import MLXRunner 2 + from ..cache_utils import get_model_path 3 + from fastapi import HTTPException 4 + from ..schemas import ChatMessage, ChatCompletionRequest, downloadRequest 5 + from ..hf_downloader import pull_model 6 + 7 + import logging 8 + import json 9 + import time 10 + import uuid 11 + from collections.abc import AsyncGenerator 12 + 13 + logger = logging.getLogger("app") 14 + 15 + from typing import Any, Dict, List, Optional, Union 16 + 17 + _model_cache: Dict[str, MLXRunner] = {} 18 + _default_max_tokens: Optional[int] = None # Use dynamic model-aware limits by default 19 + _current_model_path: Optional[str] = None 20 + 21 + 22 + def download_model(model_name: str): 23 + """Download the model""" 24 + if pull_model(model_name): 25 + return {"message": "Model downloaded"} 26 + else: 27 + raise HTTPException(status_code=400, detail="Downloading model failed") 28 + 29 + 30 + def get_or_load_model(model_spec: str, verbose: bool = False) -> MLXRunner: 31 + """Get model from cache or load it if not cached.""" 32 + global _model_cache, _current_model_path 33 + 34 + # Use the existing model path resolution from cache_utils 35 + 36 + try: 37 + model_path, model_name, commit_hash = get_model_path(model_spec) 38 + if not model_path.exists(): 39 + logger.info(f"Model {model_spec} not found in cache") 40 + raise HTTPException( 41 + status_code=404, detail=f"Model {model_spec} not found in cache" 42 + ) 43 + except Exception as e: 44 + logger.info(f"Model {model_spec} not found in: {str(e)}") 45 + raise HTTPException( 46 + status_code=404, detail=f"Model {model_spec} not found: {str(e)}" 47 + ) 48 + 49 + # Check if it's an MLX model 50 + 51 + model_path_str = str(model_path) 52 + 53 + # Check if we need to load a different model 54 + if _current_model_path != model_path_str: 55 + # Proactively clean up any previously loaded runner to release memory 56 + if _model_cache: 57 + try: 58 + for _old_runner in list(_model_cache.values()): 59 + try: 
60 + _old_runner.cleanup() 61 + except Exception: 62 + pass 63 + finally: 64 + _model_cache.clear() 65 + 66 + # Load new model 67 + if verbose: 68 + print(f"Loading model: {model_name}") 69 + 70 + logger.info(f"Loading model: {model_name}") 71 + runner = MLXRunner(model_path_str, verbose=verbose) 72 + runner.load_model() 73 + 74 + _model_cache[model_path_str] = runner 75 + _current_model_path = model_path_str 76 + else: 77 + logger.info(f"Model {model_name} already in memory") 78 + 79 + return _model_cache[model_path_str] 80 + 81 + async def generate_chat_stream( 82 + messages: List[ChatMessage], request: ChatCompletionRequest 83 + ) -> AsyncGenerator[str, None]: 84 + """Generate streaming chat completion response.""" 85 + 86 + _messages = messages 87 + completion_id = f"chatcmpl-{uuid.uuid4()}" 88 + created = int(time.time()) 89 + runner = get_or_load_model(request.model) 90 + if request.chat_start: 91 + _messages.extend(request.messages) 92 + # Convert messages to dict format for runner 93 + message_dicts = format_chat_messages_for_runner(_messages) 94 + 95 + # Let the runner format with chat templates 96 + prompt = runner._format_conversation(message_dicts, use_chat_template=True) 97 + 98 + # Yield initial response 99 + initial_response = { 100 + "id": completion_id, 101 + "object": "chat.completion.chunk", 102 + "created": created, 103 + "model": request.model, 104 + "choices": [ 105 + { 106 + "index": 0, 107 + "delta": {"role": "assistant", "content": ""}, 108 + "finish_reason": None, 109 + } 110 + ], 111 + } 112 + 113 + yield f"data: {json.dumps(initial_response)}\n\n" 114 + 115 + # Stream tokens 116 + try: 117 + for token in runner.generate_streaming( 118 + prompt=prompt, 119 + max_tokens=runner.get_effective_max_tokens( 120 + request.max_tokens or _default_max_tokens, interactive=False 121 + ), 122 + temperature=request.temperature, 123 + top_p=request.top_p, 124 + repetition_penalty=request.repetition_penalty, 125 + use_chat_template=False, # Already 
applied in _format_conversation 126 + use_chat_stop_tokens=False, # Server mode shouldn't stop on chat markers 127 + ): 128 + chunk_response = { 129 + "id": completion_id, 130 + "object": "chat.completion.chunk", 131 + "created": created, 132 + "model": request.model, 133 + "choices": [ 134 + {"index": 0, "delta": {"content": token}, "finish_reason": None} 135 + ], 136 + } 137 + 138 + yield f"data: {json.dumps(chunk_response)}\n\n" 139 + 140 + # Check for stop sequences 141 + if request.stop: 142 + stop_sequences = ( 143 + request.stop if isinstance(request.stop, list) else [request.stop] 144 + ) 145 + if any(stop in token for stop in stop_sequences): 146 + break 147 + 148 + except Exception as e: 149 + error_response = { 150 + "id": completion_id, 151 + "object": "chat.completion.chunk", 152 + "created": created, 153 + "model": request.model, 154 + "choices": [{"index": 0, "delta": {}, "finish_reason": "error"}], 155 + "error": str(e), 156 + } 157 + yield f"data: {json.dumps(error_response)}\n\n" 158 + 159 + # Final response 160 + final_response = { 161 + "id": completion_id, 162 + "object": "chat.completion.chunk", 163 + "created": created, 164 + "model": request.model, 165 + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], 166 + } 167 + 168 + yield f"data: {json.dumps(final_response)}\n\n" 169 + yield "data: [DONE]\n\n" 170 + 171 + def format_chat_messages_for_runner( 172 + messages: List[ChatMessage], 173 + ) -> List[Dict[str, str]]: 174 + """Convert chat messages to format expected by MLXRunner. 175 + 176 + Returns messages in dict format for the runner to apply chat templates. 177 + """ 178 + return [{"role": msg.role, "content": msg.content} for msg in messages] 179 + 180 + 181 + def count_tokens(text: str) -> int: 182 + """Rough token count estimation.""" 183 + return int(len(text.split()) * 1.3) # Approximation, convert to int

+205 -109

server/cache_utils.py

··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 
22 1 import datetime 23 2 import json 24 3 import os ··· 56 35 else: 57 36 return f"models--{hf_name}" 58 37 38 + 59 39 def cache_dir_to_hf(cache_name: str) -> str: 60 40 if cache_name.startswith("models--"): 61 - remaining = cache_name[len("models--"):] 41 + remaining = cache_name[len("models--") :] 62 42 if "--" in remaining: 63 43 parts = remaining.split("--", 1) 64 44 return f"{parts[0]}/{parts[1]}" ··· 66 46 return remaining 67 47 return cache_name 68 48 49 + 69 50 def expand_model_name(model_name): 70 51 if "/" in model_name: 71 52 return model_name ··· 74 55 if mlx_cache_dir.exists(): 75 56 return mlx_candidate 76 57 common_mlx_patterns = [ 77 - "Llama-", "Qwen", "Mistral", "Phi-", "Mixtral", "phi-", "deepseek" 58 + "Llama-", 59 + "Qwen", 60 + "Mistral", 61 + "Phi-", 62 + "Mixtral", 63 + "phi-", 64 + "deepseek", 78 65 ] 79 66 for pattern in common_mlx_patterns: 80 67 if pattern in model_name: 81 68 return f"mlx-community/{model_name}" 82 69 return model_name 83 70 71 + 84 72 def find_matching_models(pattern): 85 73 """Find models that match a partial pattern. Returns a list of (model_dir, hf_name) tuples.""" 86 74 all_models = [d for d in MODEL_CACHE.iterdir() if d.name.startswith("models--")] 87 75 matches = [] 88 - 76 + 89 77 for model_dir in all_models: 90 78 hf_name = cache_dir_to_hf(model_dir.name) 91 79 # Check if the pattern appears in the model name (case insensitive) 92 80 if pattern.lower() in hf_name.lower(): 93 81 matches.append((model_dir, hf_name)) 94 - 82 + 95 83 return matches 96 84 85 + 97 86 def hash_exists_in_local_cache(model_name, commit_hash): 98 87 """Check if a specific commit hash exists in the local cache for a model. 99 - 88 + 100 89 Supports both full hashes and short hash prefixes (local resolution only). 
101 - 90 + 102 91 Args: 103 92 model_name: Full model name (e.g., 'mlx-community/Phi-3-mini-4k-instruct-4bit') 104 93 commit_hash: Commit hash to check for (short or full) 105 - 94 + 106 95 Returns: 107 96 Full hash if exists in local cache, None otherwise 108 97 """ 109 98 base_cache_dir = MODEL_CACHE / hf_to_cache_dir(model_name) 110 99 if not base_cache_dir.exists(): 111 100 return None 112 - 101 + 113 102 snapshots_dir = base_cache_dir / "snapshots" 114 103 if not snapshots_dir.exists(): 115 104 return None 116 - 105 + 117 106 # Check for exact match first (full hash) 118 107 hash_dir = snapshots_dir / commit_hash 119 108 if hash_dir.exists(): 120 109 return commit_hash 121 - 110 + 122 111 # Check for short hash match (local resolution) 123 112 if len(commit_hash) < 40: 124 113 for snapshot_dir in snapshots_dir.iterdir(): 125 114 if snapshot_dir.is_dir() and snapshot_dir.name.startswith(commit_hash): 126 115 return snapshot_dir.name # Return full hash 127 - 116 + 128 117 return None 118 + 129 119 130 120 def resolve_single_model(model_spec): 131 121 """ ··· 135 125 """ 136 126 # Parse the model spec (handles @commit_hash syntax) 137 127 model_name, commit_hash = parse_model_spec(model_spec) 138 - 128 + 139 129 # Try exact match first 140 130 base_cache_dir = MODEL_CACHE / hf_to_cache_dir(model_name) 141 131 if base_cache_dir.exists(): 142 132 return get_model_path(model_spec) 143 - 133 + 144 134 # Extract the base name (without @commit_hash) for fuzzy matching 145 - base_spec = model_spec.split('@')[0] if '@' in model_spec else model_spec 146 - 135 + base_spec = model_spec.split("@")[0] if "@" in model_spec else model_spec 136 + 147 137 # Try fuzzy matching 148 138 matches = find_matching_models(base_spec) 149 - 139 + 150 140 if not matches: 151 141 print(f"No models found matching '{base_spec}'!") 152 142 return None, None, None ··· 165 155 if resolved_hash: 166 156 resolved_spec = f"{hf_name}@{resolved_hash}" 167 157 return get_model_path(resolved_spec) 168 - 
158 + 169 159 # Hash not found in any candidate model 170 160 print(f"Hash '{commit_hash}' not found in any model matching '{base_spec}'") 171 161 print("Available models:") ··· 178 168 for _, hf_name in sorted(matches, key=lambda x: x[1]): 179 169 print(f" {hf_name}") 180 170 return None, None, None 171 + 181 172 182 173 def get_model_path(model_spec): 183 174 model_name, commit_hash = parse_model_spec(model_spec) ··· 198 189 return latest, model_name, latest.name 199 190 # Return base_cache_dir for corrupted models so rm_model can handle them 200 191 return base_cache_dir, model_name, commit_hash 192 + 201 193 202 194 def parse_model_spec(model_spec): 203 195 if "@" in model_spec: ··· 207 199 model_name = expand_model_name(model_spec) 208 200 return model_name, None 209 201 202 + 210 203 def get_model_size(model_path): 211 204 if not model_path.exists(): 212 205 return "?" ··· 220 213 return f"{total_size / 1_000_000:.1f} MB" 221 214 else: 222 215 return f"{total_size / 1_000:.1f} KB" 216 + 223 217 224 218 def get_model_modified(model_path): 225 219 if not model_path.exists(): ··· 237 231 minutes = diff.seconds // 60 238 232 return f"{minutes} minutes ago" 239 233 234 + 240 235 def detect_framework(model_path, hf_name): 241 236 """Detect model framework with lenient hints (Issue #31).""" 242 237 # 1) org hint ··· 246 241 # 2) README front matter: tags contains 'mlx' OR library_name == 'mlx' 247 242 try: 248 243 tags, pipeline, lib = read_readme_front_matter(Path(model_path)) 249 - if (lib and lib.lower() == "mlx") or (tags and any((t or '').lower() == "mlx" for t in tags)): 244 + if (lib and lib.lower() == "mlx") or ( 245 + tags and any((t or "").lower() == "mlx" for t in tags) 246 + ): 250 247 return "MLX" 251 248 except Exception: 252 249 pass ··· 261 258 has_config = any(snapshots_dir.glob("*/*.json")) 262 259 total_size = get_model_size(Path(model_path)) 263 260 try: 264 - size_mb = float(total_size.replace(" GB", "000").replace(" MB", "").replace(" KB", 
"0").replace(" ", "")) 261 + size_mb = float( 262 + total_size.replace(" GB", "000") 263 + .replace(" MB", "") 264 + .replace(" KB", "0") 265 + .replace(" ", "") 266 + ) 265 267 except Exception: 266 268 size_mb = 0 267 269 if has_gguf: ··· 286 288 try: 287 289 tags, pipeline, _ = read_readme_front_matter(Path(model_path)) 288 290 tset = {t.lower() for t in (tags or [])} 289 - if pipeline == "text-generation" or any(k in tset for k in {"chat", "instruct"}): 291 + if pipeline == "text-generation" or any( 292 + k in tset for k in {"chat", "instruct"} 293 + ): 290 294 return "chat" 291 - if pipeline == "sentence-similarity" or any(k in tset for k in {"embedding", "embeddings"}): 295 + if pipeline == "sentence-similarity" or any( 296 + k in tset for k in {"embedding", "embeddings"} 297 + ): 292 298 return "embedding" 293 299 except Exception: 294 300 pass ··· 314 320 except Exception: 315 321 return None 316 322 323 + 317 324 def get_model_hash(model_path): 318 325 snapshots_dir = model_path / "snapshots" 319 326 if not snapshots_dir.exists(): ··· 323 330 return "--------" 324 331 latest = max(snapshots, key=lambda x: x.stat().st_mtime) 325 332 return latest.name[:8] 333 + 326 334 327 335 def is_model_healthy(model_spec): 328 336 """Strict health check for 1.x (backport of #27 rules). 
··· 361 369 # 2) Fail fast on partial/tmp markers anywhere in the snapshot 362 370 for p in model_path.rglob("*"): 363 371 name = p.name.lower() 364 - if ".partial" in name or name.endswith(".partial") or name.endswith(".tmp") or "partial" in name: 372 + if ( 373 + ".partial" in name 374 + or name.endswith(".partial") 375 + or name.endswith(".tmp") 376 + or "partial" in name 377 + ): 365 378 return False 366 379 367 380 # Helper: detect Git LFS pointer file ··· 414 427 # 4) No index present — detect multi-shard pattern 415 428 # If pattern shards exist, require index (unhealthy without index by policy parity with 2.0) 416 429 import re 430 + 417 431 shard_re = re.compile(r"model-([0-9]{5})-of-([0-9]{5})\.(safetensors|bin)") 418 432 pattern_files = [] 419 433 for f in model_path.glob("*"): ··· 426 440 return False 427 441 428 442 # 5) Single-file weights fallback (includes GGUF) 429 - weight_files = list(model_path.rglob("*.safetensors")) + list(model_path.rglob("*.bin")) + list(model_path.rglob("*.gguf")) 443 + weight_files = ( 444 + list(model_path.rglob("*.safetensors")) 445 + + list(model_path.rglob("*.bin")) 446 + + list(model_path.rglob("*.gguf")) 447 + ) 430 448 # Exclude known pattern shards from consideration (handled above) 431 449 filtered_weights = [] 432 450 for f in weight_files: ··· 444 462 ok, _ = check_lfs_corruption(model_path) 445 463 return ok 446 464 465 + 447 466 def check_lfs_corruption(model_path): 448 467 """Recursively scan for Git LFS pointer files (suspiciously small files).""" 449 468 corrupted_files = [] 450 469 for file_path in model_path.rglob("*"): 451 470 try: 452 471 if file_path.is_file() and file_path.stat().st_size < 200: 453 - with open(file_path, 'rb') as f: 472 + with open(file_path, "rb") as f: 454 473 header = f.read(200) 455 - if b'version https://git-lfs.github.com/spec/v1' in header: 474 + if b"version https://git-lfs.github.com/spec/v1" in header: 456 475 corrupted_files.append(str(file_path.relative_to(model_path))) 
457 476 except Exception: 458 477 # Ignore unreadable files in corruption scan, keep conservative ··· 461 480 return False, f"LFS pointers instead of files: {', '.join(corrupted_files)}" 462 481 return True, "No LFS corruption detected" 463 482 483 + 464 484 def check_model_health(model_spec): 465 485 model_path, model_name, commit_hash = resolve_single_model(model_spec) 466 486 if not model_path: 467 487 # resolve_single_model already printed the appropriate error message 468 488 return False 469 - 489 + 470 490 print(f"Checking model: {model_name}") 471 491 if commit_hash: 472 492 print(f"Hash: {commit_hash}") 473 - 493 + 474 494 # Use the robust health check 475 495 if is_model_healthy(model_spec): 476 496 print("\n[OK] Model is healthy and usable!") ··· 478 498 else: 479 499 # Detailed diagnosis for WHY it's unhealthy 480 500 print("\n[ERROR] Model is corrupted. Detailed diagnosis:") 481 - 501 + 482 502 # Check config.json 483 503 config_path = model_path / "config.json" 484 504 if not config_path.exists(): ··· 493 513 print(" - config.json found and valid") 494 514 except (OSError, json.JSONDecodeError): 495 515 print(" - config.json exists but contains invalid JSON") 496 - 516 + 497 517 # Check weight files (including gguf support like is_model_healthy) 498 - weight_files = list(model_path.glob("*.safetensors")) + list(model_path.glob("*.bin")) + list(model_path.glob("*.gguf")) 518 + weight_files = ( 519 + list(model_path.glob("*.safetensors")) 520 + + list(model_path.glob("*.bin")) 521 + + list(model_path.glob("*.gguf")) 522 + ) 499 523 if not weight_files: 500 - weight_files = list(model_path.glob("**/*.safetensors")) + list(model_path.glob("**/*.bin")) + list(model_path.glob("**/*.gguf")) 501 - 524 + weight_files = ( 525 + list(model_path.glob("**/*.safetensors")) 526 + + list(model_path.glob("**/*.bin")) 527 + + list(model_path.glob("**/*.gguf")) 528 + ) 529 + 502 530 if weight_files: 503 531 total_size = sum(f.stat().st_size for f in weight_files) 504 
532 size_mb = total_size / (1024 * 1024) 505 - print(f" - Model weights found ({len(weight_files)} files, {size_mb:.1f}MB)") 533 + print( 534 + f" - Model weights found ({len(weight_files)} files, {size_mb:.1f}MB)" 535 + ) 506 536 elif (model_path / "model.safetensors.index.json").exists(): 507 537 # Check multi-file model 508 538 try: 509 539 with open(model_path / "model.safetensors.index.json") as f: 510 540 index = json.load(f) 511 - if 'weight_map' in index: 512 - referenced_files = set(index['weight_map'].values()) 513 - existing_files = [f for f in referenced_files if (model_path / f).exists()] 541 + if "weight_map" in index: 542 + referenced_files = set(index["weight_map"].values()) 543 + existing_files = [ 544 + f for f in referenced_files if (model_path / f).exists() 545 + ] 514 546 if existing_files: 515 - total_size = sum((model_path / f).stat().st_size for f in existing_files) 547 + total_size = sum( 548 + (model_path / f).stat().st_size for f in existing_files 549 + ) 516 550 size_mb = total_size / (1024 * 1024) 517 - print(f" - Multi-file weights ({len(existing_files)}/{len(referenced_files)} files, {size_mb:.1f}MB)") 551 + print( 552 + f" - Multi-file weights ({len(existing_files)}/{len(referenced_files)} files, {size_mb:.1f}MB)" 553 + ) 518 554 if len(existing_files) < len(referenced_files): 519 555 print(" - Incomplete multi-file model") 520 556 else: 521 - print(" - Multi-file model index found but no weight files exist") 557 + print( 558 + " - Multi-file model index found but no weight files exist" 559 + ) 522 560 else: 523 561 print(" - Multi-file model index is invalid") 524 562 except Exception as e: 525 563 print(f" - Multi-file model index error: {e}") 526 564 else: 527 565 print(" - No model weights found (.safetensors, .bin, .gguf)") 528 - 566 + 529 567 # Check LFS corruption 530 568 lfs_ok, lfs_msg = check_lfs_corruption(model_path) 531 569 if not lfs_ok: 532 570 print(f" - {lfs_msg}") 533 571 else: 534 572 print(f" - {lfs_msg}") 535 - 
573 + 536 574 # Show framework 537 575 framework = detect_framework(model_path.parent.parent, model_name) 538 576 print(f" - Framework: {framework}") 539 - 577 + 540 578 # Offer deletion for corrupted models 541 579 confirm = input("\nModel appears corrupted. Delete? [y/N] ") 542 580 if confirm.lower() == "y": 543 581 import errno 544 582 import shutil 583 + 545 584 try: 546 585 if commit_hash: 547 586 # Delete specific hash/snapshot ··· 559 598 print(f"Model {model_name} deleted.") 560 599 except PermissionError as e: 561 600 print(f"[ERROR] Permission denied: Cannot delete {e.filename}") 562 - print(" Try running with appropriate permissions or manually delete the directory.") 601 + print( 602 + " Try running with appropriate permissions or manually delete the directory." 603 + ) 563 604 except OSError as e: 564 605 if e.errno == errno.ENOTEMPTY: 565 606 print(f"[ERROR] Directory not empty: {e.filename}") ··· 569 610 else: 570 611 print(f"[ERROR] OS Error while deleting: {e}") 571 612 except Exception as e: 572 - print(f"[ERROR] Unexpected error while deleting: {type(e).__name__}: {e}") 573 - 613 + print( 614 + f"[ERROR] Unexpected error while deleting: {type(e).__name__}: {e}" 615 + ) 616 + 574 617 return False 618 + 575 619 576 620 def check_all_models_health(): 577 621 models = [d for d in MODEL_CACHE.iterdir() if d.name.startswith("models--")] ··· 605 649 print(" python mlx_knife.cli health <model-name> # Show details") 606 650 return len(problematic_models) == 0 607 651 608 - def list_models(show_all=False, framework_filter=None, show_health=False, single_model=None, verbose=False): 652 + 653 + def list_models( 654 + show_all=False, 655 + framework_filter=None, 656 + show_health=False, 657 + single_model=None, 658 + verbose=False, 659 + ): 609 660 if single_model: 610 661 # Try exact match first 611 662 expanded_model = expand_model_name(single_model) ··· 616 667 else: 617 668 # If exact match fails, do partial name matching 618 669 if not 
MODEL_CACHE.exists(): 619 - print(f"No models found matching '{single_model}' - cache directory doesn't exist yet.") 670 + print( 671 + f"No models found matching '{single_model}' - cache directory doesn't exist yet." 672 + ) 620 673 print("Use 'mlxk pull <model-name>' to download models first.") 621 674 return 622 - all_models = [d for d in MODEL_CACHE.iterdir() if d.name.startswith("models--")] 675 + all_models = [ 676 + d for d in MODEL_CACHE.iterdir() if d.name.startswith("models--") 677 + ] 623 678 matching_models = [] 624 - 679 + 625 680 for model_dir in all_models: 626 681 hf_name = cache_dir_to_hf(model_dir.name) 627 682 # Check if the pattern appears in the model name (case insensitive) 628 683 if single_model.lower() in hf_name.lower(): 629 684 matching_models.append(model_dir) 630 - 685 + 631 686 if not matching_models: 632 687 print(f"No models found matching '{single_model}'!") 633 688 return 634 - 689 + 635 690 models = matching_models 636 691 else: 637 692 if not MODEL_CACHE.exists(): ··· 644 699 return 645 700 if show_health: 646 701 if show_all: 647 - print(f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'FRAMEWORK':<10} {'TYPE':<10} {'HEALTH':<8}") 702 + print( 703 + f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'FRAMEWORK':<10} {'TYPE':<10} {'HEALTH':<8}" 704 + ) 648 705 else: 649 - print(f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'HEALTH':<8}") 706 + print( 707 + f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'HEALTH':<8}" 708 + ) 650 709 else: 651 710 if show_all: 652 - print(f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'FRAMEWORK':<10} {'TYPE':<10}") 711 + print( 712 + f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15} {'FRAMEWORK':<10} {'TYPE':<10}" 713 + ) 653 714 else: 654 715 print(f"{'NAME':<40} {'ID':<10} {'SIZE':<10} {'MODIFIED':<15}") 655 716 for m in sorted(models, key=lambda x: x.stat().st_mtime, reverse=True): ··· 671 732 display_name = hf_name 672 733 if 
hf_name.startswith("mlx-community/") and not verbose: 673 734 # For MLX models, hide prefix unless verbose is set 674 - display_name = hf_name[len("mlx-community/"):] 735 + display_name = hf_name[len("mlx-community/") :] 675 736 health_status = "" 676 737 if show_health: 677 738 health_status = "[OK]" if is_model_healthy(hf_name) else "[ERR]" 678 739 if show_all: 679 - print(f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {framework:<10} {model_type:<10} {health_status:<8}") 740 + print( 741 + f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {framework:<10} {model_type:<10} {health_status:<8}" 742 + ) 680 743 else: 681 - print(f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {health_status:<8}") 744 + print( 745 + f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {health_status:<8}" 746 + ) 682 747 else: 683 748 if show_all: 684 - print(f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {framework:<10} {model_type:<10}") 749 + print( 750 + f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15} {framework:<10} {model_type:<10}" 751 + ) 685 752 else: 686 753 print(f"{display_name:<40} {model_hash:<10} {size:<10} {modified:<15}") 687 754 688 - def run_model(model_spec, prompt=None, interactive=False, temperature=0.7, 689 - max_tokens=500, top_p=0.9, repetition_penalty=1.1, stream=True, 690 - use_chat_template=True, hide_reasoning=False, verbose=False): 755 + 756 + def run_model( 757 + model_spec, 758 + prompt=None, 759 + interactive=False, 760 + temperature=0.7, 761 + max_tokens=500, 762 + top_p=0.9, 763 + repetition_penalty=1.1, 764 + stream=True, 765 + use_chat_template=True, 766 + hide_reasoning=False, 767 + verbose=False, 768 + ): 691 769 """Run an MLX model with enhanced features. 
692 - 770 + 693 771 Args: 694 772 model_spec: Model specification (name[@hash]) 695 773 prompt: Input prompt (if None and not interactive, enters interactive mode) ··· 730 808 ) 731 809 except ImportError: 732 810 # Fallback to subprocess if mlx_runner is not available 733 - print("[WARNING] Enhanced runner not available, falling back to subprocess mode") 811 + print( 812 + "[WARNING] Enhanced runner not available, falling back to subprocess mode" 813 + ) 734 814 print(f"Running model: {model_name}") 735 815 if commit_hash: 736 816 print(f"Hash: {commit_hash}") ··· 741 821 prompt = prompt or "Hello" 742 822 743 823 print(f"Prompt: {prompt}\n") 744 - os.system(f'python -m mlx_lm generate --model "{model_path}" --prompt "{prompt}"') 824 + os.system( 825 + f'python -m mlx_lm generate --model "{model_path}" --prompt "{prompt}"' 826 + ) 827 + 745 828 746 829 def show_model(model_spec, show_files=False, show_config=False): 747 830 """Show detailed information about a specific model.""" ··· 774 857 model_type = detect_model_type(model_path.parent.parent, model_name) 775 858 print(f"Framework: {framework}") 776 859 print(f"Type: {model_type}") 777 - 860 + 778 861 # Quantization info (if available) 779 862 quant_info = get_quantization_info(model_path) 780 863 if quant_info: ··· 787 870 main_config.append(f"{quant_info['bits']}-bit") 788 871 if "group_size" in quant_info: 789 872 main_config.append(f"group_size: {quant_info['group_size']}") 790 - 873 + 791 874 if main_config: 792 875 print(f"Quantization: {', '.join(main_config)}") 793 876 if "mode" in quant_info: 794 - print(f" Advanced mode '{quant_info['mode']}' (requires MLX ≥0.29.0, MLX-LM ≥0.27.0)") 877 + print( 878 + f" Advanced mode '{quant_info['mode']}' (requires MLX ≥0.29.0, MLX-LM ≥0.27.0)" 879 + ) 795 880 else: 796 881 print(f"Quantization: {quant_info}") 797 882 ··· 807 892 config_data = json.load(f) 808 893 809 894 # 1. 
Check for explicit quantization field (MLX style) 810 - if "quantization" in config_data and isinstance(config_data["quantization"], dict): 895 + if "quantization" in config_data and isinstance( 896 + config_data["quantization"], dict 897 + ): 811 898 quant = config_data["quantization"] 812 899 if "bits" in quant: 813 900 quantization_info = f"{quant['bits']}-bit" ··· 878 965 quantization_info = "Multiple GGUF variants available" 879 966 precision_info = "gguf (see variants below)" 880 967 elif len(gguf_variants) == 1: 881 - quantization_info = gguf_variants[0].split(' (')[0] 968 + quantization_info = gguf_variants[0].split(" (")[0] 882 969 precision_info = "gguf" 883 970 else: 884 971 quantization_info = "GGUF format (quantization unknown)" ··· 915 1002 if not (model_path / "config.json").exists(): 916 1003 issues.append("config.json missing") 917 1004 918 - weight_files = list(model_path.glob("*.safetensors")) + list(model_path.glob("*.bin")) + list(model_path.glob("*.gguf")) 1005 + weight_files = ( 1006 + list(model_path.glob("*.safetensors")) 1007 + + list(model_path.glob("*.bin")) 1008 + + list(model_path.glob("*.gguf")) 1009 + ) 919 1010 if not weight_files: 920 - weight_files = list(model_path.glob("**/*.safetensors")) + list(model_path.glob("**/*.bin")) + list(model_path.glob("**/*.gguf")) 1011 + weight_files = ( 1012 + list(model_path.glob("**/*.safetensors")) 1013 + + list(model_path.glob("**/*.bin")) 1014 + + list(model_path.glob("**/*.gguf")) 1015 + ) 921 1016 if not weight_files: 922 1017 index_file = model_path / "model.safetensors.index.json" 923 1018 if not index_file.exists(): ··· 974 1069 975 1070 return True 976 1071 1072 + 977 1073 def rm_model(model_spec, force=False): 978 1074 original_spec = model_spec 979 - 1075 + 980 1076 # First try to resolve using fuzzy matching 981 1077 resolved_path, resolved_name, resolved_hash = resolve_single_model(model_spec) 982 - 1078 + 983 1079 if not resolved_path: 984 1080 # resolve_single_model already 
printed the error message for most cases 985 1081 # But ensure we always provide feedback to the user 986 1082 print(f"Model '{original_spec}' not found or corrupted.") 987 1083 return 988 - 1084 + 989 1085 # Use the resolved model name for deletion 990 1086 model_name = resolved_name 991 1087 commit_hash = resolved_hash 992 - 993 - 1088 + 994 1089 # Confirm on auto-expansion (if the resolved name is different from input) 995 1090 base_spec = original_spec.split("@")[0] if "@" in original_spec else original_spec 996 1091 if base_spec != model_name and "/" not in base_spec: ··· 998 1093 if confirm.lower() == "n": 999 1094 print("Delete aborted.") 1000 1095 return 1001 - 1096 + 1002 1097 base_cache_dir = MODEL_CACHE / hf_to_cache_dir(model_name) 1003 1098 # This should exist since resolve_single_model succeeded, but double-check 1004 1099 if not base_cache_dir.exists(): ··· 1021 1116 else: 1022 1117 confirm = input(f"Delete hash {commit_hash} of model {model_name}? [y/N] ") 1023 1118 confirm_delete = confirm.lower() == "y" 1024 - 1119 + 1025 1120 if confirm_delete: 1026 1121 # Issue #23 Fix: Delete entire model directory, not just the snapshot 1027 1122 # This prevents the double-execution problem where refs/ remain intact 1028 1123 shutil.rmtree(base_cache_dir) 1029 1124 print(f"{model_name}@{commit_hash} deleted") 1030 - 1125 + 1031 1126 # Clean up associated lock files 1032 1127 try: 1033 1128 _cleanup_model_locks(model_name, force) ··· 1040 1135 if force: 1041 1136 confirm_delete = True 1042 1137 else: 1043 - confirm = input(f"Delete entire model {model_name} ({base_cache_dir})? [y/N] ") 1138 + confirm = input( 1139 + f"Delete entire model {model_name} ({base_cache_dir})? 
[y/N] " 1140 + ) 1044 1141 confirm_delete = confirm.lower() == "y" 1045 - 1142 + 1046 1143 if confirm_delete: 1047 1144 shutil.rmtree(base_cache_dir) 1048 1145 print(f"Model {model_name} completely deleted.") 1049 - 1146 + 1050 1147 # Clean up associated lock files 1051 1148 try: 1052 1149 _cleanup_model_locks(model_name, force) ··· 1058 1155 1059 1156 def _cleanup_model_locks(model_name, force=False): 1060 1157 """Clean up HuggingFace lock files for a deleted model. 1061 - 1158 + 1062 1159 Args: 1063 1160 model_name: The model name (e.g. 'microsoft/DialoGPT-small') 1064 1161 force: If True, delete without asking. If False, prompt user. 1065 1162 """ 1066 1163 locks_dir = MODEL_CACHE / ".locks" / hf_to_cache_dir(model_name) 1067 - 1164 + 1068 1165 if not locks_dir.exists(): 1069 1166 return # No locks to clean up 1070 - 1167 + 1071 1168 # Count lock files 1072 1169 try: 1073 1170 lock_files = list(locks_dir.iterdir()) 1074 1171 if not lock_files: 1075 1172 return # Empty directory 1076 - 1173 + 1077 1174 if force: 1078 1175 # Delete without asking 1079 1176 shutil.rmtree(locks_dir) ··· 1086 1183 print(f"Cache files cleaned up ({len(lock_files)} files).") 1087 1184 else: 1088 1185 print("Cache files left intact.") 1089 - 1186 + 1090 1187 except Exception as e: 1091 1188 print(f"Warning: Could not clean up cache files: {e}") 1092 -

+31 -11

server/main.py

··· 1 1 import uvicorn 2 + 3 + # from backend import linux 2 4 from .api import app 3 - from .config import PORT 5 + from .config import PORT 4 6 import logging 5 7 import sys 6 - from fastapi import Request 8 + from fastapi import Request 9 + from . import runtime 7 10 8 - # --- logging setup --- 9 11 logging.basicConfig( 10 12 level=logging.INFO, 11 13 format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", ··· 14 16 logger = logging.getLogger("app") 15 17 16 18 17 - # --- middleware for request logging --- 18 19 @app.middleware("http") 19 20 async def log_requests(request: Request, call_next): 20 21 try: ··· 22 23 except Exception: 23 24 body = None 24 25 25 - logger.info({ 26 - "method": request.method, 27 - "url": str(request.url), 28 - "client": request.client.host, 29 - "body": body, 30 - }) 26 + logger.info( 27 + { 28 + "method": request.method, 29 + "url": str(request.url), 30 + "client": request.client.host, 31 + "body": body, 32 + } 33 + ) 31 34 32 35 response = await call_next(request) 33 36 logger.info(f"<-- {request.method} {request.url.path} {response.status_code}") 34 37 return response 35 38 39 + def get_backend(): 40 + """ 41 + Dynamically choose which backend should be used depending on the OS 42 + """ 43 + if sys.platform == "darwin": 44 + from .backend import mlx 45 + logger.info("Using MLX backend (MacOs)") 46 + return mlx 47 + elif sys.platform.startswith("linux"): 48 + from .backend import linux 49 + logger.info(f"Using linux backend {sys.platform}") 50 + return linux 51 + else: 52 + raise RuntimeError(f"Unsupported OS: {sys.platform}") 53 + 54 + runtime.backend = get_backend() 55 + 36 56 def run(): 37 57 uvicorn.run(app, host="127.0.0.1", port=PORT) 38 58 59 + 39 60 if __name__ == "__main__": 40 61 run() 41 -

+254 -176

server/mlx_runner.py server/backend/mlx_runner.py

··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 22 - 23 1 """ 24 2 Enhanced MLX model runner with direct API integration. 25 3 Provides ollama-like run experience with streaming and interactive chat. 
26 4 """ 27 5 6 + import sys 28 7 import json 29 8 import os 30 9 import time ··· 32 11 from pathlib import Path 33 12 from typing import Dict, Optional 34 13 35 - import mlx.core as mx 14 + if sys.platform == "darwin": 15 + import mlx.core as mx 16 + else: 17 + mx = None 36 18 from mlx_lm import load 37 19 from mlx_lm.generate import generate_step 38 20 from mlx_lm.sample_utils import make_repetition_penalty, make_sampler 39 21 40 - from .reasoning_utils import ReasoningExtractor, StreamingReasoningParser 22 + from ..reasoning_utils import ReasoningExtractor, StreamingReasoningParser 41 23 42 24 43 25 def get_model_context_length(model_path: str) -> int: 44 26 """Extract max_position_embeddings from model config. 45 - 27 + 46 28 Args: 47 29 model_path: Path to the MLX model directory 48 - 30 + 49 31 Returns: 50 32 Maximum context length for the model (defaults to 4096 if not found) 51 33 """ 52 34 config_path = os.path.join(model_path, "config.json") 53 - 35 + 54 36 try: 55 37 with open(config_path) as f: 56 38 config = json.load(f) 57 - 39 + 58 40 # Try various common config keys for context length 59 41 context_keys = [ 60 42 "max_position_embeddings", 61 43 "n_positions", 62 44 "context_length", 63 45 "max_sequence_length", 64 - "seq_len" 46 + "seq_len", 65 47 ] 66 - 48 + 67 49 for key in context_keys: 68 50 if key in config: 69 51 return config[key] 70 - 52 + 71 53 # If no context length found, return reasonable default 72 54 return 4096 73 - 55 + 74 56 except (FileNotFoundError, json.JSONDecodeError, KeyError): 75 57 # Return default if config can't be read 76 58 return 4096 ··· 79 61 class MLXRunner: 80 62 """Direct MLX model runner with streaming and interactive capabilities.""" 81 63 82 - def __init__(self, model_path: str, adapter_path: Optional[str] = None, verbose: bool = False): 64 + def __init__( 65 + self, model_path: str, adapter_path: Optional[str] = None, verbose: bool = False 66 + ): 83 67 """Initialize the runner with a model. 
84 - 68 + 85 69 Args: 86 70 model_path: Path to the MLX model directory 87 71 adapter_path: Optional path to LoRA adapter ··· 107 91 def __enter__(self): 108 92 """Context manager entry - loads the model.""" 109 93 if self._context_entered: 110 - raise RuntimeError("MLXRunner context manager cannot be entered multiple times") 111 - 94 + raise RuntimeError( 95 + "MLXRunner context manager cannot be entered multiple times" 96 + ) 97 + 112 98 self._context_entered = True 113 99 try: 114 100 self.load_model() ··· 146 132 try: 147 133 # Load model and tokenizer 148 134 self.model, self.tokenizer = load( 149 - str(self.model_path), 150 - adapter_path=self.adapter_path 135 + str(self.model_path), adapter_path=self.adapter_path 151 136 ) 152 137 153 138 load_time = time.time() - start_time ··· 156 141 157 142 if self.verbose: 158 143 print(f"Model loaded in {load_time:.1f}s") 159 - print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total") 144 + print( 145 + f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total" 146 + ) 160 147 161 148 # Extract stop tokens from tokenizer 162 149 self._extract_stop_tokens() 163 - 150 + 164 151 # Extract context length from model config 165 152 self._context_length = get_model_context_length(str(self.model_path)) 166 - 153 + 167 154 if self.verbose: 168 155 print(f"Model context length: {self._context_length} tokens") 169 - 156 + 170 157 self._model_loaded = True 171 - 158 + 172 159 except Exception as e: 173 160 # Ensure partial state is cleaned up on failure 174 161 self.model = None ··· 177 164 self._model_loaded = False 178 165 # Clear any memory that might have been allocated 179 166 mx.clear_cache() 180 - raise RuntimeError(f"Failed to load model from {self.model_path}: {e}") from e 167 + raise RuntimeError( 168 + f"Failed to load model from {self.model_path}: {e}" 169 + ) from e 181 170 182 171 def _extract_stop_tokens(self): 183 172 """Extract stop tokens from the tokenizer dynamically. 
184 - 173 + 185 174 This method identifies ALL tokens that should stop generation: 186 175 1. Official EOS token from tokenizer config 187 176 2. Message-end tokens from training (e.g., <|end|> for MXFP4) 188 177 3. Common stop tokens across models 189 178 """ 190 179 self._stop_tokens = set() 191 - self._message_end_tokens = set() # Tokens that end messages but not conversations 180 + self._message_end_tokens = ( 181 + set() 182 + ) # Tokens that end messages but not conversations 192 183 193 184 # Primary source: eos_token 194 - eos_token = getattr(self.tokenizer, 'eos_token', None) 185 + eos_token = getattr(self.tokenizer, "eos_token", None) 195 186 if eos_token: 196 187 self._stop_tokens.add(eos_token) 197 188 198 189 # Also check pad_token if it's different from eos_token 199 - pad_token = getattr(self.tokenizer, 'pad_token', None) 190 + pad_token = getattr(self.tokenizer, "pad_token", None) 200 191 if pad_token and pad_token != eos_token: 201 192 self._stop_tokens.add(pad_token) 202 193 203 194 # Check additional_special_tokens 204 - if hasattr(self.tokenizer, 'additional_special_tokens'): 195 + if hasattr(self.tokenizer, "additional_special_tokens"): 205 196 for token in self.tokenizer.additional_special_tokens: 206 197 if token and isinstance(token, str): 207 198 # Only add tokens that look like stop/end tokens 208 - if any(keyword in token.lower() for keyword in ['end', 'stop', 'eot']): 199 + if any( 200 + keyword in token.lower() for keyword in ["end", "stop", "eot"] 201 + ): 209 202 self._stop_tokens.add(token) 210 - 203 + 211 204 # MLX-LM 0.27.0+: Extract tokens from added_tokens_decoder (comprehensive source) 212 - if hasattr(self.tokenizer, 'added_tokens_decoder'): 205 + if hasattr(self.tokenizer, "added_tokens_decoder"): 213 206 for _token_id, token_info in self.tokenizer.added_tokens_decoder.items(): 214 - if isinstance(token_info, dict) and 'content' in token_info: 215 - token_content = token_info['content'] 207 + if isinstance(token_info, dict) 
and "content" in token_info: 208 + token_content = token_info["content"] 216 209 if token_content and isinstance(token_content, str): 217 210 token_lower = token_content.lower() 218 - 211 + 219 212 # NOTE: <|end|> is NOT a stop token for MXFP4 models! 220 213 # It's a separator between reasoning and final answer 221 - if token_content == '<|end|>': 214 + if token_content == "<|end|>": 222 215 self._message_end_tokens.add(token_content) 223 216 # Do NOT add as stop token - let model continue to final answer 224 - 217 + 225 218 # Look for tokens that could be end/stop tokens 226 219 # Expanded patterns for MLX-LM 0.27.0 token varieties 227 220 # EXCLUDE <|end|> for MXFP4 models as it's a reasoning separator 228 - end_patterns = ['stop', 'eot', 'return', 'finish', 'done', 'im_end'] 221 + end_patterns = [ 222 + "stop", 223 + "eot", 224 + "return", 225 + "finish", 226 + "done", 227 + "im_end", 228 + ] 229 229 if any(pattern in token_lower for pattern in end_patterns): 230 230 # Decide if it's a message-end or conversation-end token 231 - if 'im_end' in token_lower: 231 + if "im_end" in token_lower: 232 232 self._message_end_tokens.add(token_content) 233 233 self._stop_tokens.add(token_content) 234 234 # Special handling for 'end' pattern - more selective 235 - elif 'end' in token_lower and token_content != '<|end|>': 235 + elif "end" in token_lower and token_content != "<|end|>": 236 236 # Only add non-<|end|> tokens with 'end' in them 237 237 self._stop_tokens.add(token_content) 238 - 238 + 239 239 # Special case: control tokens in |..| format 240 - elif token_content.startswith('<|') and token_content.endswith('|>'): 240 + elif token_content.startswith("<|") and token_content.endswith( 241 + "|>" 242 + ): 241 243 # Be inclusive with control tokens that might stop generation 242 - if any(pattern in token_lower for pattern in ['end', 'return', 'stop', 'finish']): 244 + if any( 245 + pattern in token_lower 246 + for pattern in ["end", "return", "stop", "finish"] 247 + ): 
243 248 self._stop_tokens.add(token_content) 244 249 245 250 # Model-specific handling based on known patterns 246 251 # Use reasoning_utils for reasoning model detection and patterns 247 - from .reasoning_utils import ReasoningExtractor 248 - 249 - if hasattr(self.tokenizer, 'name_or_path'): 250 - name_or_path = str(getattr(self.tokenizer, 'name_or_path', '')).lower() 252 + from ..reasoning_utils import ReasoningExtractor 253 + 254 + if hasattr(self.tokenizer, "name_or_path"): 255 + name_or_path = str(getattr(self.tokenizer, "name_or_path", "")).lower() 251 256 model_type = ReasoningExtractor.detect_model_type(name_or_path) 252 - 257 + 253 258 if model_type: 254 259 # This is a reasoning model 255 260 self._is_reasoning_model = True 256 - 261 + 257 262 # Get patterns from reasoning_utils 258 263 if model_type in ReasoningExtractor.PATTERNS: 259 - markers = ReasoningExtractor.PATTERNS[model_type]['markers'] 260 - self._reasoning_start = markers.get('reasoning_start') 261 - self._reasoning_end = markers.get('reasoning_end') 262 - self._final_start = markers.get('final_marker') 263 - 264 + markers = ReasoningExtractor.PATTERNS[model_type]["markers"] 265 + self._reasoning_start = markers.get("reasoning_start") 266 + self._reasoning_end = markers.get("reasoning_end") 267 + self._final_start = markers.get("final_marker") 268 + 264 269 # For reasoning models, remove reasoning_end from stop tokens 265 270 if self._reasoning_end: 266 271 self._stop_tokens.discard(self._reasoning_end) 267 - 272 + 268 273 # Add proper stop token for this model type 269 - if model_type == 'gpt-oss': 270 - if '<|return|>' not in self._stop_tokens: 271 - self._stop_tokens.add('<|return|>') 274 + if model_type == "gpt-oss": 275 + if "<|return|>" not in self._stop_tokens: 276 + self._stop_tokens.add("<|return|>") 272 277 else: 273 278 self._is_reasoning_model = False 274 279 else: 275 280 self._is_reasoning_model = False 276 281 277 282 # Add common stop tokens that might not be in special tokens 
278 - common_stop_tokens = {'</s>', '<|endoftext|>', '<|im_end|>', '<|eot_id|>'} 279 - 283 + common_stop_tokens = {"</s>", "<|endoftext|>", "<|im_end|>", "<|eot_id|>"} 284 + 280 285 # Add chat-specific stop tokens to prevent model self-conversations 281 286 # Based on our _format_conversation() format: "Human:" and "Assistant:" 282 287 # Also include "You:" as models might use UI-visible format 283 288 # Include single-letter variations (H:, A:, Y:) that some models use 284 289 chat_stop_tokens = { 285 - '\nHuman:', '\nAssistant:', '\nYou:', 286 - '\n\nHuman:', '\n\nAssistant:', '\n\nYou:', 287 - '\nH:', '\nA:', '\nY:', # Single-letter variations 288 - '\n\nH:', '\n\nA:', '\n\nY:' 290 + "\nHuman:", 291 + "\nAssistant:", 292 + "\nYou:", 293 + "\n\nHuman:", 294 + "\n\nAssistant:", 295 + "\n\nYou:", 296 + "\nH:", 297 + "\nA:", 298 + "\nY:", # Single-letter variations 299 + "\n\nH:", 300 + "\n\nA:", 301 + "\n\nY:", 289 302 } 290 303 291 304 # Add common stop tokens only if they decode to themselves (i.e., they're single tokens) ··· 299 312 self._stop_tokens.add(token) 300 313 except: 301 314 pass 302 - 315 + 303 316 # Store chat stop tokens separately - only used in interactive chat mode 304 317 # This prevents stopping mid-story when user asks for dialogues 305 318 self._chat_stop_tokens = list(chat_stop_tokens) ··· 320 333 321 334 def cleanup(self): 322 335 """Clean up model resources and clear GPU memory. 323 - 336 + 324 337 This method is safe to call multiple times and handles partial state cleanup. 
325 338 """ 326 339 if self.verbose and self._model_loaded: ··· 342 355 343 356 # Force garbage collection and clear MLX cache 344 357 import gc 358 + 345 359 gc.collect() 346 360 try: 347 361 mx.clear_cache() ··· 350 364 351 365 if self.verbose: 352 366 memory_after = mx.get_active_memory() / 1024**3 353 - if 'memory_before' in locals(): 367 + if "memory_before" in locals(): 354 368 memory_freed = memory_before - memory_after 355 - print(f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)") 369 + print( 370 + f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)" 371 + ) 356 372 else: 357 373 print(f"Cleanup complete (memory after: {memory_after:.1f}GB)") 358 374 359 - def get_effective_max_tokens(self, requested_tokens: Optional[int], interactive: bool = False) -> int: 375 + def get_effective_max_tokens( 376 + self, requested_tokens: Optional[int], interactive: bool = False 377 + ) -> int: 360 378 """Get effective max tokens based on model context and usage mode. 
361 - 379 + 362 380 Args: 363 381 requested_tokens: The requested max tokens (None if user didn't specify --max-tokens) 364 382 interactive: True if this is interactive mode (gets full context length) 365 - 383 + 366 384 Returns: 367 385 Effective max tokens to use 368 386 """ ··· 371 389 fallback = 4096 if interactive else 2048 372 390 if self.verbose: 373 391 if requested_tokens is None: 374 - print(f"[WARNING] Model context length unknown, using fallback: {fallback} tokens") 392 + print( 393 + f"[WARNING] Model context length unknown, using fallback: {fallback} tokens" 394 + ) 375 395 else: 376 - print(f"[WARNING] Model context length unknown, using user specified: {requested_tokens} tokens") 396 + print( 397 + f"[WARNING] Model context length unknown, using user specified: {requested_tokens} tokens" 398 + ) 377 399 return requested_tokens if requested_tokens is not None else fallback 378 - 400 + 379 401 if interactive: 380 402 if requested_tokens is None: 381 403 # User didn't specify --max-tokens: use full model context ··· 402 424 hide_reasoning: bool = False, 403 425 ) -> Iterator[str]: 404 426 """Generate text with streaming output. 405 - 427 + 406 428 Args: 407 429 prompt: Input prompt 408 430 max_tokens: Maximum tokens to generate ··· 413 435 use_chat_template: Apply tokenizer's chat template if available 414 436 use_chat_stop_tokens: Include chat turn markers as stop tokens (for interactive mode) 415 437 interactive: True if this is interactive mode (affects token limits) 416 - 438 + 417 439 Yields: 418 440 Generated tokens as they are produced 419 441 """ 420 442 if not self.model or not self.tokenizer: 421 443 raise RuntimeError("Model not loaded. 
Call load_model() first.") 422 - 444 + 423 445 # Initialize reasoning parser if this is a reasoning model 424 446 reasoning_parser = None 425 447 if self._is_reasoning_model: 426 448 model_type = ReasoningExtractor.detect_model_type( 427 - getattr(self.tokenizer, 'name_or_path', '') or '' 449 + getattr(self.tokenizer, "name_or_path", "") or "" 450 + ) 451 + reasoning_parser = StreamingReasoningParser( 452 + model_type, hide_reasoning=hide_reasoning 428 453 ) 429 - reasoning_parser = StreamingReasoningParser(model_type, hide_reasoning=hide_reasoning) 430 454 431 455 # Apply context-aware token limits 432 456 effective_max_tokens = self.get_effective_max_tokens(max_tokens, interactive) 433 457 434 458 # Apply chat template if available and requested 435 - if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template: 459 + if ( 460 + use_chat_template 461 + and hasattr(self.tokenizer, "chat_template") 462 + and self.tokenizer.chat_template 463 + ): 436 464 messages = [{"role": "user", "content": prompt}] 437 465 formatted_prompt = self.tokenizer.apply_chat_template( 438 - messages, 439 - tokenize=False, 440 - add_generation_prompt=True 466 + messages, tokenize=False, add_generation_prompt=True 441 467 ) 442 468 else: 443 469 formatted_prompt = prompt ··· 479 505 480 506 for token, _ in generator: 481 507 # Token might be an array or an int 482 - token_id = token.item() if hasattr(token, 'item') else token 508 + token_id = token.item() if hasattr(token, "item") else token 483 509 generated_tokens.append(token_id) 484 510 485 511 # Use a sliding window approach for efficiency ··· 493 519 if start_idx == 0: 494 520 # We're still within the context window 495 521 if window_text.startswith(previous_decoded): 496 - new_text = window_text[len(previous_decoded):] 522 + new_text = window_text[len(previous_decoded) :] 497 523 else: 498 524 new_text = self.tokenizer.decode([token_id]) 499 525 previous_decoded = window_text ··· 504 530 if 
len(window_tokens) > 1: 505 531 prefix = self.tokenizer.decode(window_tokens[:-1]) 506 532 if new_text.startswith(prefix): 507 - new_text = new_text[len(prefix):] 533 + new_text = new_text[len(prefix) :] 508 534 else: 509 535 new_text = self.tokenizer.decode([token_id]) 510 536 511 537 if new_text: 512 538 # Update accumulated response for stop token checking 513 539 accumulated_response += new_text 514 - 540 + 515 541 # Filter out stop tokens with priority: native first, then chat fallback 516 542 # Check native stop tokens FIRST in accumulated response (highest priority) 517 543 native_stop_tokens = self._stop_tokens if self._stop_tokens else [] ··· 522 548 # Calculate what text came before the stop token 523 549 text_before_stop = accumulated_response[:stop_pos] 524 550 # Calculate how much of that is new (not previously yielded) 525 - previously_yielded_length = len(accumulated_response) - len(new_text) 551 + previously_yielded_length = len(accumulated_response) - len( 552 + new_text 553 + ) 526 554 if len(text_before_stop) > previously_yielded_length: 527 555 # Yield only the new part before stop token 528 - new_part_before_stop = text_before_stop[previously_yielded_length:] 556 + new_part_before_stop = text_before_stop[ 557 + previously_yielded_length: 558 + ] 529 559 if new_part_before_stop: 530 560 if reasoning_parser: 531 561 # Process through reasoning parser for formatting 532 - for formatted_token in reasoning_parser.process_token(new_part_before_stop): 562 + for ( 563 + formatted_token 564 + ) in reasoning_parser.process_token( 565 + new_part_before_stop 566 + ): 533 567 yield formatted_token 534 568 else: 535 569 yield new_part_before_stop 536 570 return # Stop generation without yielding stop token 537 - 571 + 538 572 # Only check chat stop tokens if no native stop token found (fallback) 539 573 if use_chat_stop_tokens and self._chat_stop_tokens: 540 574 for stop_token in self._chat_stop_tokens: ··· 544 578 # Calculate what text came before the stop 
token 545 579 text_before_stop = accumulated_response[:stop_pos] 546 580 # Calculate how much of that is new (not previously yielded) 547 - previously_yielded_length = len(accumulated_response) - len(new_text) 581 + previously_yielded_length = len(accumulated_response) - len( 582 + new_text 583 + ) 548 584 if len(text_before_stop) > previously_yielded_length: 549 585 # Yield only the new part before stop token 550 - new_part_before_stop = text_before_stop[previously_yielded_length:] 586 + new_part_before_stop = text_before_stop[ 587 + previously_yielded_length: 588 + ] 551 589 if new_part_before_stop: 552 590 if reasoning_parser: 553 591 # Process through reasoning parser for formatting 554 - for formatted_token in reasoning_parser.process_token(new_part_before_stop): 592 + for ( 593 + formatted_token 594 + ) in reasoning_parser.process_token( 595 + new_part_before_stop 596 + ): 555 597 yield formatted_token 556 598 else: 557 599 yield new_part_before_stop ··· 574 616 # Finalize reasoning parser if used 575 617 if reasoning_parser: 576 618 yield from reasoning_parser.finalize() 577 - 619 + 578 620 # Print generation statistics if verbose 579 621 if self.verbose: 580 622 generation_time = time.time() - start_time 581 - tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0 582 - print(f"\n\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)") 623 + tokens_per_second = ( 624 + tokens_generated / generation_time if generation_time > 0 else 0 625 + ) 626 + print( 627 + f"\n\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)" 628 + ) 583 629 584 630 def generate_batch( 585 631 self, ··· 593 639 interactive: bool = False, 594 640 ) -> str: 595 641 """Generate text in batch mode (non-streaming). 
596 - 642 + 597 643 Args: 598 644 prompt: Input prompt 599 645 max_tokens: Maximum tokens to generate ··· 603 649 repetition_context_size: Context size for repetition penalty 604 650 use_chat_template: Apply tokenizer's chat template if available 605 651 interactive: True if this is interactive mode (affects token limits) 606 - 652 + 607 653 Returns: 608 654 Generated text 609 655 """ ··· 614 660 effective_max_tokens = self.get_effective_max_tokens(max_tokens, interactive) 615 661 616 662 # Apply chat template if available and requested 617 - if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template: 663 + if ( 664 + use_chat_template 665 + and hasattr(self.tokenizer, "chat_template") 666 + and self.tokenizer.chat_template 667 + ): 618 668 messages = [{"role": "user", "content": prompt}] 619 669 formatted_prompt = self.tokenizer.apply_chat_template( 620 - messages, 621 - tokenize=False, 622 - add_generation_prompt=True 670 + messages, tokenize=False, add_generation_prompt=True 623 671 ) 624 672 else: 625 673 formatted_prompt = prompt ··· 654 702 655 703 for token, _ in generator: 656 704 # Token might be an array or an int 657 - token_id = token.item() if hasattr(token, 'item') else token 705 + token_id = token.item() if hasattr(token, "item") else token 658 706 generated_tokens.append(token_id) 659 707 all_tokens.append(token_id) 660 708 ··· 667 715 668 716 # Remove the prompt part 669 717 if full_response.startswith(formatted_prompt): 670 - response = full_response[len(formatted_prompt):] 718 + response = full_response[len(formatted_prompt) :] 671 719 else: 672 720 # Fallback: just decode generated tokens 673 721 response = self.tokenizer.decode(generated_tokens) 674 722 675 723 # Apply end-token filtering (same logic as streaming mode for Issue #20) 676 - response = self._filter_end_tokens_from_response(response, use_chat_stop_tokens=False) 677 - 724 + response = self._filter_end_tokens_from_response( 725 + response, 
use_chat_stop_tokens=False 726 + ) 727 + 678 728 # Format reasoning models output 679 729 response = self._format_reasoning_response(response) 680 730 ··· 683 733 # Count tokens for statistics 684 734 if self.verbose: 685 735 tokens_generated = len(generated_tokens) 686 - tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0 687 - print(f"\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)") 736 + tokens_per_second = ( 737 + tokens_generated / generation_time if generation_time > 0 else 0 738 + ) 739 + print( 740 + f"\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)" 741 + ) 688 742 689 743 return response 690 744 ··· 698 752 use_chat_template: bool = True, 699 753 ): 700 754 """Run an interactive chat session. 701 - 755 + 702 756 Args: 703 757 system_prompt: Optional system prompt to prepend 704 758 max_tokens: Maximum tokens per response ··· 718 772 # Get user input 719 773 user_input = input("You: ").strip() 720 774 721 - if user_input.lower() in ['exit', 'quit', 'q']: 775 + if user_input.lower() in ["exit", "quit", "q"]: 722 776 print("\nGoodbye!") 723 777 break 724 778 ··· 729 783 conversation_history.append({"role": "user", "content": user_input}) 730 784 731 785 # Format conversation for the model using chat template if available 732 - prompt = self._format_conversation(conversation_history, use_chat_template=use_chat_template) 786 + prompt = self._format_conversation( 787 + conversation_history, use_chat_template=use_chat_template 788 + ) 733 789 734 790 # Generate response with streaming 735 791 print("\nAssistant: ", end="", flush=True) ··· 751 807 752 808 # Add assistant response to history 753 809 assistant_response = "".join(response_tokens).strip() 754 - conversation_history.append({"role": "assistant", "content": assistant_response}) 810 + conversation_history.append( 811 + {"role": "assistant", "content": assistant_response} 
812 + ) 755 813 756 814 print() # New line after response 757 815 ··· 762 820 print(f"\n[ERROR] {e}") 763 821 continue 764 822 765 - def _format_conversation(self, messages: list, use_chat_template: bool = True) -> str: 823 + def _format_conversation( 824 + self, messages: list, use_chat_template: bool = True 825 + ) -> str: 766 826 """Format conversation history into a prompt. 767 - 827 + 768 828 Uses the tokenizer's chat template if available, otherwise falls back 769 829 to the legacy Human:/Assistant: format for compatibility. 770 - 830 + 771 831 Args: 772 832 messages: List of message dictionaries with 'role' and 'content' 773 833 use_chat_template: Whether to use chat template if available 774 - 834 + 775 835 Returns: 776 836 Formatted conversation string 777 837 """ 778 838 # Try to use native chat template if available 779 - if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template: 839 + if ( 840 + use_chat_template 841 + and hasattr(self.tokenizer, "chat_template") 842 + and self.tokenizer.chat_template 843 + ): 780 844 try: 781 845 # Apply the tokenizer's chat template 782 846 formatted_prompt = self.tokenizer.apply_chat_template( 783 - messages, 784 - tokenize=False, 785 - add_generation_prompt=True 847 + messages, tokenize=False, add_generation_prompt=True 786 848 ) 787 849 return formatted_prompt 788 850 except Exception as e: 789 851 # If chat template fails, fall back to legacy format 790 852 if self.verbose: 791 853 print(f"[WARNING] Chat template failed, using legacy format: {e}") 792 - 854 + 793 855 # Legacy format fallback for compatibility 794 856 return self._legacy_format_conversation(messages) 795 - 857 + 796 858 def _legacy_format_conversation(self, messages: list) -> str: 797 859 """Legacy conversation formatting for backward compatibility. 798 - 860 + 799 861 This format was used in earlier versions and remains as a fallback 800 862 for models without chat templates. 
801 863 """ ··· 819 881 820 882 def get_memory_usage(self) -> Dict[str, float]: 821 883 """Get current memory usage statistics. 822 - 884 + 823 885 Returns: 824 886 Dictionary with memory statistics in GB 825 887 """ ··· 834 896 return { 835 897 "current_gb": current_memory, 836 898 "peak_gb": peak_memory, 837 - "model_gb": current_memory - self._memory_baseline if self._memory_baseline else 0, 899 + "model_gb": ( 900 + current_memory - self._memory_baseline if self._memory_baseline else 0 901 + ), 838 902 } 839 903 840 904 def _format_reasoning_response(self, response: str) -> str: 841 905 """Format response from reasoning models for better readability. 842 - 906 + 843 907 For MXFP4 models that generate reasoning followed by final answer, 844 908 format it nicely for display. 845 909 """ 846 910 if not self._is_reasoning_model: 847 911 return response 848 - 912 + 849 913 # Check if response contains reasoning markers 850 914 if self._reasoning_start in response and self._final_start in response: 851 915 # Extract reasoning and final parts 852 916 try: 853 917 # Split on the reasoning start 854 918 before_reasoning, after_start = response.split(self._reasoning_start, 1) 855 - 919 + 856 920 # Find the reasoning content (until <|end|>) 857 921 if self._reasoning_end in after_start: 858 - reasoning_content, after_reasoning = after_start.split(self._reasoning_end, 1) 859 - 922 + reasoning_content, after_reasoning = after_start.split( 923 + self._reasoning_end, 1 924 + ) 925 + 860 926 # Find the final answer 861 927 if self._final_start in after_reasoning: 862 928 # Extract everything after final marker 863 929 final_parts = after_reasoning.split(self._final_start, 1) 864 930 if len(final_parts) > 1: 865 931 # Remove the <|channel|>final<|message|> marker 866 - final_answer = final_parts[1].replace('<|channel|>final<|message|>', '', 1) 867 - 932 + final_answer = final_parts[1].replace( 933 + "<|channel|>final<|message|>", "", 1 934 + ) 935 + 868 936 # Format with clear 
markers for parsing but minimal visual impact 869 937 formatted = [] 870 938 formatted.append("\n**[Reasoning]**\n") 871 939 formatted.append(reasoning_content.strip()) 872 940 formatted.append("\n\n---\n\n**[Answer]**\n") 873 941 formatted.append(final_answer.strip()) 874 - 875 - return '\n'.join(formatted) 942 + 943 + return "\n".join(formatted) 876 944 except Exception: 877 945 # If parsing fails, return original 878 946 pass 879 - 947 + 880 948 # Fallback: just clean up the control tokens 881 949 cleaned = response 882 - for marker in ['<|channel|>analysis<|message|>', '<|end|>', '<|start|>assistant', 883 - '<|channel|>final<|message|>', '<|return|>']: 884 - cleaned = cleaned.replace(marker, '') 885 - 950 + for marker in [ 951 + "<|channel|>analysis<|message|>", 952 + "<|end|>", 953 + "<|start|>assistant", 954 + "<|channel|>final<|message|>", 955 + "<|return|>", 956 + ]: 957 + cleaned = cleaned.replace(marker, "") 958 + 886 959 return cleaned.strip() 887 - 888 - def _filter_end_tokens_from_response(self, response: str, use_chat_stop_tokens: bool = False) -> str: 960 + 961 + def _filter_end_tokens_from_response( 962 + self, response: str, use_chat_stop_tokens: bool = False 963 + ) -> str: 889 964 """Filter end tokens from a complete response (batch mode). 890 - 965 + 891 966 This method applies the same filtering logic as the streaming mode 892 967 to ensure consistent behavior between streaming and non-streaming. 
893 - 968 + 894 969 Args: 895 970 response: The complete generated response 896 971 use_chat_stop_tokens: Whether to apply chat stop tokens 897 - 972 + 898 973 Returns: 899 974 Response with end tokens filtered out 900 975 """ ··· 906 981 stop_pos = response.find(stop_token) 907 982 filtered_response = response[:stop_pos].rstrip() 908 983 if self.verbose: 909 - print(f"[DEBUG] Filtered stop token '{stop_token}' at position {stop_pos}") 984 + print( 985 + f"[DEBUG] Filtered stop token '{stop_token}' at position {stop_pos}" 986 + ) 910 987 return filtered_response 911 - 988 + 912 989 # Only check chat stop tokens if no native stop token found (fallback) 913 990 if use_chat_stop_tokens and self._chat_stop_tokens: 914 991 for stop_token in self._chat_stop_tokens: ··· 916 993 # Find the stop token position and return everything before it 917 994 stop_pos = response.find(stop_token) 918 995 return response[:stop_pos] 919 - 996 + 920 997 # No stop tokens found, return original response 921 998 return response 922 999 923 1000 924 1001 def get_gpu_status() -> Dict[str, float]: 925 1002 """Independent GPU status check - usable from anywhere. 926 - 1003 + 927 1004 Returns: 928 1005 Dictionary with GPU memory statistics in GB 929 1006 """ ··· 935 1012 936 1013 def check_memory_available(required_gb: float) -> bool: 937 1014 """Pre-flight check before model loading. 938 - 1015 + 939 1016 Args: 940 1017 required_gb: Required memory in GB 941 - 1018 + 942 1019 Returns: 943 1020 True if memory is likely available (conservative estimate) 944 1021 """ ··· 966 1043 verbose: bool = False, 967 1044 ) -> Optional[str]: 968 1045 """Enhanced run function with direct MLX integration. 969 - 1046 + 970 1047 Uses context manager pattern for automatic resource cleanup. 
971 - 1048 + 972 1049 Args: 973 1050 model_path: Path to the MLX model 974 1051 prompt: Input prompt (if None, enters interactive mode) ··· 978 1055 top_p: Top-p sampling parameter 979 1056 repetition_penalty: Penalty for repeated tokens 980 1057 stream: Whether to stream output 981 - 1058 + 982 1059 Returns: 983 1060 Generated text (in non-interactive mode) 984 1061 """ ··· 1038 1115 # Show memory usage if verbose 1039 1116 if verbose: 1040 1117 memory_stats = runner.get_memory_usage() 1041 - print(f"\n\nMemory: {memory_stats['model_gb']:.1f}GB model, {memory_stats['current_gb']:.1f}GB total") 1118 + print( 1119 + f"\n\nMemory: {memory_stats['model_gb']:.1f}GB model, {memory_stats['current_gb']:.1f}GB total" 1120 + ) 1042 1121 1043 1122 return response 1044 1123 ··· 1047 1126 except Exception as e: 1048 1127 print(f"\n[ERROR] {e}") 1049 1128 return None 1050 -

+27 -43

server/model_card.py

··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 
22 1 from __future__ import annotations 23 2 24 3 # ruff: noqa: UP045 ··· 45 24 def _latest_snapshot_dir(model_base_dir: Path) -> Optional[Path]: 46 25 """Return latest snapshot directory for a cached HF model base dir.""" 47 26 try: 48 - snaps = (model_base_dir / "snapshots") 27 + snaps = model_base_dir / "snapshots" 49 28 if not snaps.exists(): 50 29 return None 51 30 candidates = [d for d in snaps.iterdir() if d.is_dir()] ··· 73 52 """ 74 53 start = text.find("\n---\n") 75 54 # Accept files starting directly with '---' too 76 - if text.startswith('---'): 55 + if text.startswith("---"): 77 56 start = 0 78 57 elif start >= 0: 79 58 start = start + 1 # move to line start 80 59 else: 81 60 # Try at very beginning without newline 82 - start = 0 if text[:3] == '---' else -1 61 + start = 0 if text[:3] == "---" else -1 83 62 if start != 0: 84 63 return {} 85 64 86 65 # Find closing '---' after start 87 - end = text.find('\n---', 3) 66 + end = text.find("\n---", 3) 88 67 if end == -1: 89 68 return {} 90 - header = text[3:end] if text.startswith('---') else text[start + 3:end] 69 + header = text[3:end] if text.startswith("---") else text[start + 3 : end] 91 70 92 71 # Normalize lines 93 72 lines = [ln.strip() for ln in header.splitlines() if ln.strip()] ··· 103 82 list_acc = [] 104 83 105 84 for ln in lines: 106 - if ln.startswith('- '): 85 + if ln.startswith("- "): 107 86 # list item under current_key 108 - val = ln[2:].strip().strip('"\'') 87 + val = ln[2:].strip().strip("\"'") 109 88 if current_key is not None: 110 89 list_acc.append(val) 111 90 continue 112 91 # key: value or key: [a, b] 113 - if ':' in ln: 92 + if ":" in ln: 114 93 # Close any previous list 115 94 flush_list() 116 - key, val = ln.split(':', 1) 95 + key, val = ln.split(":", 1) 117 96 key = key.strip() 118 97 val = val.strip() 119 98 current_key = key ··· 122 101 data.setdefault(key, []) 123 102 continue 124 103 # Inline list [a, b] 125 - if val.startswith('[') and val.endswith(']'): 104 + if 
val.startswith("[") and val.endswith("]"): 126 105 inner = val[1:-1].strip() 127 - items = [] if not inner else [it.strip().strip('"\'') for it in inner.split(',')] 106 + items = ( 107 + [] 108 + if not inner 109 + else [it.strip().strip("\"'") for it in inner.split(",")] 110 + ) 128 111 data[key] = [x for x in items if x] 129 112 continue 130 113 # Scalar 131 - data[key] = val.strip('"\'') 114 + data[key] = val.strip("\"'") 132 115 continue 133 116 # Non key-value, ignore 134 117 # Flush last list ··· 136 119 return data 137 120 138 121 139 - def read_readme_front_matter(model_base_dir: Path) -> Tuple[Optional[List[str]], Optional[str], Optional[str]]: 122 + def read_readme_front_matter( 123 + model_base_dir: Path, 124 + ) -> Tuple[Optional[List[str]], Optional[str], Optional[str]]: 140 125 """Read README.md front matter and extract tags, pipeline_tag, library_name. 141 126 142 127 Returns (tags, pipeline_tag, library_name) with lowercase normalization where applicable. ··· 146 131 snap = _latest_snapshot_dir(model_base_dir) 147 132 if not snap: 148 133 return None, None, None 149 - readme = snap / 'README.md' 134 + readme = snap / "README.md" 150 135 if not readme.exists(): 151 136 return None, None, None 152 - text = readme.read_text(encoding='utf-8', errors='ignore') 137 + text = readme.read_text(encoding="utf-8", errors="ignore") 153 138 fm = _lenient_yaml_front_matter(text) 154 139 if not fm: 155 140 return None, None, None 156 - tags = fm.get('tags') 141 + tags = fm.get("tags") 157 142 if isinstance(tags, list): 158 143 tags = [str(t).strip().lower() for t in tags if str(t).strip()] 159 144 else: 160 145 tags = None 161 - pipeline = fm.get('pipeline_tag') 146 + pipeline = fm.get("pipeline_tag") 162 147 pipeline = str(pipeline).strip().lower() if pipeline else None 163 - lib = fm.get('library_name') 148 + lib = fm.get("library_name") 164 149 lib = str(lib).strip().lower() if lib else None 165 150 return tags, pipeline, lib 166 151 except Exception: ··· 173 
158 snap = _latest_snapshot_dir(model_base_dir) 174 159 if not snap: 175 160 return False 176 - tk = snap / 'tokenizer_config.json' 161 + tk = snap / "tokenizer_config.json" 177 162 if not tk.exists(): 178 163 return False 179 - with open(tk, encoding='utf-8') as f: 164 + with open(tk, encoding="utf-8") as f: 180 165 data = json.load(f) 181 - tmpl = data.get('chat_template') 166 + tmpl = data.get("chat_template") 182 167 return bool(tmpl and isinstance(tmpl, str) and tmpl.strip()) 183 168 except Exception: 184 169 return False 185 -

+3

server/pyproject.toml

··· 14 14 [build-system] 15 15 requires = ["setuptools", "wheel"] 16 16 build-backend = "setuptools.build_meta" 17 + 18 + [tool.setuptools.packages.find] 19 + exclude = ["backend", "backend.*"]

+5

server/pyrightconfig.json

··· 1 + { 2 + "venvPath": ".", 3 + "venv": ".venv" 4 + } 5 +

+172 -165

server/reasoning_utils.py

··· 1 - # MIT License 2 - 3 - # Copyright (c) 2025 The BROKE team 🦫 4 - 5 - # Permission is hereby granted, free of charge, to any person obtaining a copy 6 - # of this software and associated documentation files (the "Software"), to deal 7 - # in the Software without restriction, including without limitation the rights 8 - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 - # copies of the Software, and to permit persons to whom the Software is 10 - # furnished to do so, subject to the following conditions: 11 - 12 - # The above copyright notice and this permission notice shall be included in all 13 - # copies or substantial portions of the Software. 14 - 15 - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 - # SOFTWARE. 22 1 """ 23 2 Utilities for handling reasoning models and their output. 
24 3 ··· 35 14 36 15 class ReasoningExtractor: 37 16 """Extract reasoning and final answer from model outputs.""" 38 - 17 + 39 18 # Model-specific patterns 40 19 PATTERNS = { 41 - 'gpt-oss': { 42 - 'reasoning': r'<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>', 43 - 'final': r'<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)', 44 - 'markers': { 45 - 'reasoning_start': '<|channel|>analysis<|message|>', 46 - 'reasoning_end': '<|end|>', 47 - 'final_marker': '<|channel|>final<|message|>', 20 + "gpt-oss": { 21 + "reasoning": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>", 22 + "final": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)", 23 + "markers": { 24 + "reasoning_start": "<|channel|>analysis<|message|>", 25 + "reasoning_end": "<|end|>", 26 + "final_marker": "<|channel|>final<|message|>", 48 27 # Skip tokens that appear between reasoning and final 49 - 'skip_tokens': ['<|start|>assistant<|channel|>final<|message|>', '<|start|>assistant', '<|start|>', '<|channel|>final<|message|>'], 28 + "skip_tokens": [ 29 + "<|start|>assistant<|channel|>final<|message|>", 30 + "<|start|>assistant", 31 + "<|start|>", 32 + "<|channel|>final<|message|>", 33 + ], 50 34 # Conditional skip tokens - only skip if at start of final section 51 - 'conditional_skip': ['assistant'] 52 - } 35 + "conditional_skip": ["assistant"], 36 + }, 53 37 }, 54 - 'deepseek': { 55 - 'reasoning': r'<think>(.*?)</think>', 56 - 'final': r'</think>(.*?)$', 57 - 'markers': { 58 - 'reasoning_start': '<think>', 59 - 'reasoning_end': '</think>', 60 - } 38 + "deepseek": { 39 + "reasoning": r"<think>(.*?)</think>", 40 + "final": r"</think>(.*?)$", 41 + "markers": { 42 + "reasoning_start": "<think>", 43 + "reasoning_end": "</think>", 44 + }, 61 45 }, 62 - 'claude': { 63 - 'reasoning': r'<thinking>(.*?)</thinking>', 64 - 'final': r'</thinking>(.*?)$', 65 - 'markers': { 66 - 'reasoning_start': '<thinking>', 67 - 'reasoning_end': '</thinking>', 68 - } 69 - } 46 + "claude": { 47 + "reasoning": 
r"<thinking>(.*?)</thinking>", 48 + "final": r"</thinking>(.*?)$", 49 + "markers": { 50 + "reasoning_start": "<thinking>", 51 + "reasoning_end": "</thinking>", 52 + }, 53 + }, 70 54 } 71 - 55 + 72 56 @classmethod 73 57 def detect_model_type(cls, model_name: str) -> Optional[str]: 74 58 """Detect reasoning model type from model name.""" 75 59 model_lower = model_name.lower() 76 - 77 - if 'gpt-oss' in model_lower: 78 - return 'gpt-oss' 79 - elif 'deepseek' in model_lower and 'r1' in model_lower: 80 - return 'deepseek' 81 - elif 'claude' in model_lower: 82 - return 'claude' 83 - elif 'qwq' in model_lower: 84 - return 'gpt-oss' # QwQ uses similar format to GPT-OSS 85 - 60 + 61 + if "gpt-oss" in model_lower: 62 + return "gpt-oss" 63 + elif "deepseek" in model_lower and "r1" in model_lower: 64 + return "deepseek" 65 + elif "claude" in model_lower: 66 + return "claude" 67 + elif "qwq" in model_lower: 68 + return "gpt-oss" # QwQ uses similar format to GPT-OSS 69 + 86 70 return None 87 - 71 + 88 72 @classmethod 89 - def extract(cls, text: str, model_type: Optional[str] = None, 90 - model_name: Optional[str] = None) -> Dict[str, Optional[str]]: 73 + def extract( 74 + cls, 75 + text: str, 76 + model_type: Optional[str] = None, 77 + model_name: Optional[str] = None, 78 + ) -> Dict[str, Optional[str]]: 91 79 """ 92 80 Extract reasoning and final answer from model output. 93 - 81 + 94 82 Args: 95 83 text: The full model output 96 84 model_type: Explicit model type ('mxfp4', 'deepseek', etc.) 
97 85 model_name: Model name to auto-detect type 98 - 86 + 99 87 Returns: 100 88 Dictionary with 'reasoning', 'final_answer', and 'full_response' 101 89 """ 102 90 # Auto-detect model type if not provided 103 91 if not model_type and model_name: 104 92 model_type = cls.detect_model_type(model_name) 105 - 93 + 106 94 # If no model type detected, return text as-is 107 95 if not model_type or model_type not in cls.PATTERNS: 108 96 return { 109 - 'reasoning': None, 110 - 'final_answer': text, 111 - 'full_response': text, 112 - 'has_reasoning': False 97 + "reasoning": None, 98 + "final_answer": text, 99 + "full_response": text, 100 + "has_reasoning": False, 113 101 } 114 - 102 + 115 103 patterns = cls.PATTERNS[model_type] 116 - 104 + 117 105 # Extract reasoning 118 - reasoning_match = re.search(patterns['reasoning'], text, re.DOTALL) 106 + reasoning_match = re.search(patterns["reasoning"], text, re.DOTALL) 119 107 reasoning = reasoning_match.group(1).strip() if reasoning_match else None 120 - 108 + 121 109 # Extract final answer 122 - final_match = re.search(patterns['final'], text, re.DOTALL) 110 + final_match = re.search(patterns["final"], text, re.DOTALL) 123 111 final_answer = final_match.group(1).strip() if final_match else None 124 - 112 + 125 113 # If no final answer found but we have reasoning, 126 114 # the text after reasoning might be the answer 127 115 if reasoning and not final_answer: 128 116 # Try to find text after reasoning markers 129 - markers = patterns.get('markers', {}) 130 - if 'reasoning_end' in markers: 131 - split_text = text.split(markers['reasoning_end'], 1) 117 + markers = patterns.get("markers", {}) 118 + if "reasoning_end" in markers: 119 + split_text = text.split(markers["reasoning_end"], 1) 132 120 if len(split_text) > 1: 133 121 # Clean up any remaining markers 134 122 remaining = split_text[1] 135 123 for marker in markers.values(): 136 - remaining = remaining.replace(marker, '') 124 + remaining = remaining.replace(marker, "") 137 125 
final_answer = remaining.strip() 138 - 126 + 139 127 # If still no final answer, use full text minus reasoning markers 140 128 if not final_answer: 141 129 final_answer = text 142 130 # Remove all known markers 143 131 if model_type in cls.PATTERNS: 144 - markers = cls.PATTERNS[model_type].get('markers', {}) 132 + markers = cls.PATTERNS[model_type].get("markers", {}) 145 133 for marker in markers.values(): 146 - final_answer = final_answer.replace(marker, '') 134 + final_answer = final_answer.replace(marker, "") 147 135 final_answer = final_answer.strip() 148 - 136 + 149 137 return { 150 - 'reasoning': reasoning, 151 - 'final_answer': final_answer, 152 - 'full_response': text, 153 - 'has_reasoning': bool(reasoning), 154 - 'model_type': model_type 138 + "reasoning": reasoning, 139 + "final_answer": final_answer, 140 + "full_response": text, 141 + "has_reasoning": bool(reasoning), 142 + "model_type": model_type, 155 143 } 156 - 144 + 157 145 @classmethod 158 - def format_for_display(cls, extracted: Dict[str, Optional[str]], 159 - show_reasoning: bool = False) -> str: 146 + def format_for_display( 147 + cls, extracted: Dict[str, Optional[str]], show_reasoning: bool = False 148 + ) -> str: 160 149 """ 161 150 Format extracted content for display. 
162 - 151 + 163 152 Args: 164 153 extracted: Output from extract() 165 154 show_reasoning: Whether to include reasoning in output 166 - 155 + 167 156 Returns: 168 157 Formatted string for display 169 158 """ 170 - if not extracted.get('has_reasoning'): 171 - return extracted.get('final_answer', '') 172 - 159 + if not extracted.get("has_reasoning"): 160 + return extracted.get("final_answer", "") 161 + 173 162 if show_reasoning: 174 163 output = [] 175 - if extracted.get('reasoning'): 164 + if extracted.get("reasoning"): 176 165 output.append("═══ Reasoning ═══") 177 - output.append(extracted['reasoning']) 166 + output.append(extracted["reasoning"]) 178 167 output.append("\n═══ Answer ═══") 179 - output.append(extracted.get('final_answer', '')) 180 - return '\n'.join(output) 168 + output.append(extracted.get("final_answer", "")) 169 + return "\n".join(output) 181 170 else: 182 - return extracted.get('final_answer', '') 171 + return extracted.get("final_answer", "") 183 172 184 173 185 174 class StreamingReasoningHandler: 186 175 """Handle reasoning during streaming generation.""" 187 - 176 + 188 177 def __init__(self, model_type: Optional[str] = None): 189 178 self.model_type = model_type 190 179 self.buffer = "" ··· 193 182 self.in_reasoning = False 194 183 self.in_final = False 195 184 self.markers = {} 196 - 185 + 197 186 if model_type and model_type in ReasoningExtractor.PATTERNS: 198 - self.markers = ReasoningExtractor.PATTERNS[model_type].get('markers', {}) 199 - 187 + self.markers = ReasoningExtractor.PATTERNS[model_type].get("markers", {}) 188 + 200 189 def process_token(self, token: str) -> Tuple[str, bool]: 201 190 """ 202 191 Process a streaming token. 
203 - 192 + 204 193 Args: 205 194 token: The new token 206 - 195 + 207 196 Returns: 208 197 (output_token, should_display) - token to output and whether to display it 209 198 """ 210 199 self.buffer += token 211 - 200 + 212 201 # Check for reasoning start 213 - if not self.in_reasoning and self.markers.get('reasoning_start'): 214 - if self.markers['reasoning_start'] in self.buffer: 202 + if not self.in_reasoning and self.markers.get("reasoning_start"): 203 + if self.markers["reasoning_start"] in self.buffer: 215 204 self.in_reasoning = True 216 - self.reasoning_buffer = self.buffer.split(self.markers['reasoning_start'])[1] 205 + self.reasoning_buffer = self.buffer.split( 206 + self.markers["reasoning_start"] 207 + )[1] 217 208 return ("", False) # Don't display reasoning start marker 218 - 209 + 219 210 # If in reasoning, buffer it 220 211 if self.in_reasoning: 221 212 self.reasoning_buffer += token 222 - 213 + 223 214 # Check for reasoning end 224 - if self.markers.get('reasoning_end') and self.markers['reasoning_end'] in self.reasoning_buffer: 215 + if ( 216 + self.markers.get("reasoning_end") 217 + and self.markers["reasoning_end"] in self.reasoning_buffer 218 + ): 225 219 self.in_reasoning = False 226 220 self.in_final = True 227 221 # Clean up reasoning buffer 228 - self.reasoning_buffer = self.reasoning_buffer.replace(self.markers['reasoning_end'], '') 222 + self.reasoning_buffer = self.reasoning_buffer.replace( 223 + self.markers["reasoning_end"], "" 224 + ) 229 225 return ("", False) # Don't display reasoning end marker 230 - 226 + 231 227 return ("", False) # Don't display reasoning content by default 232 - 228 + 233 229 # If in final answer section 234 230 if self.in_final: 235 231 # Skip final answer markers 236 - if self.markers.get('final_marker') and self.markers['final_marker'] in token: 232 + if ( 233 + self.markers.get("final_marker") 234 + and self.markers["final_marker"] in token 235 + ): 237 236 return ("", False) 238 - 237 + 239 238 
self.final_buffer += token 240 239 return (token, True) # Display final answer 241 - 240 + 242 241 # Default: display token if not in special section 243 242 return (token, True) 244 243 245 244 246 245 class StreamingReasoningParser: 247 246 """Parser for real-time streaming with reasoning model formatting.""" 248 - 247 + 249 248 def __init__(self, model_type: Optional[str] = None, hide_reasoning: bool = False): 250 249 self.model_type = model_type 251 250 self.hide_reasoning = hide_reasoning ··· 253 252 self.buffer = "" 254 253 self.reasoning_content = "" 255 254 self.patterns = {} 256 - 255 + 257 256 if model_type and model_type in ReasoningExtractor.PATTERNS: 258 - self.patterns = ReasoningExtractor.PATTERNS[model_type].get('markers', {}) 259 - 257 + self.patterns = ReasoningExtractor.PATTERNS[model_type].get("markers", {}) 258 + 260 259 def process_token(self, token: str): 261 260 """ 262 261 Process a streaming token and yield formatted output. 263 - 262 + 264 263 Args: 265 264 token: New token from model 266 - 265 + 267 266 Yields: 268 267 Formatted output tokens for display 269 268 """ 270 269 self.buffer += token 271 - 270 + 272 271 # State: WAITING - looking for reasoning start 273 272 if self.state == "WAITING": 274 - reasoning_start = self.patterns.get('reasoning_start') 273 + reasoning_start = self.patterns.get("reasoning_start") 275 274 if reasoning_start and reasoning_start in self.buffer: 276 275 # Found reasoning start 277 276 before_reasoning = self.buffer.split(reasoning_start, 1)[0] 278 - 277 + 279 278 # Yield any content before reasoning (but not control tokens) 280 - if before_reasoning.strip() and not before_reasoning.strip().startswith('<|'): 279 + if before_reasoning.strip() and not before_reasoning.strip().startswith( 280 + "<|" 281 + ): 281 282 yield before_reasoning 282 - 283 + 283 284 # Start reasoning section (only if not hiding reasoning) 284 285 if not self.hide_reasoning: 285 286 yield "**[Reasoning]**\n\n" 286 - 287 + 287 288 # 
Switch to reasoning state 288 289 self.buffer = self.buffer.split(reasoning_start, 1)[1] 289 290 self.state = "IN_REASONING" 290 - 291 + 291 292 # Process remaining buffer recursively 292 293 if self.buffer.strip(): 293 294 yield from self.process_token("") 294 295 return 295 - 296 + 296 297 # Check if buffer might contain start of reasoning pattern 297 298 if reasoning_start: 298 299 # Check if buffer ends with partial pattern ··· 301 302 if self.buffer.endswith(reasoning_start[:i]): 302 303 has_partial_match = True 303 304 break 304 - 305 + 305 306 if has_partial_match: 306 307 # Don't yield yet - might be building up to pattern 307 308 return 308 - 309 + 309 310 # No partial match, safe to yield older content 310 311 # Keep enough buffer to detect pattern 311 312 pattern_len = len(reasoning_start) ··· 315 316 if to_yield: 316 317 yield to_yield 317 318 return 318 - 319 + 319 320 # No reasoning pattern expected or very short buffer 320 321 if not reasoning_start: 321 322 yield token 322 - 323 + 323 324 # State: IN_REASONING - collecting reasoning content 324 325 elif self.state == "IN_REASONING": 325 - reasoning_end = self.patterns.get('reasoning_end') 326 + reasoning_end = self.patterns.get("reasoning_end") 326 327 if reasoning_end and reasoning_end in self.buffer: 327 328 # Found reasoning end 328 329 reasoning_part = self.buffer.split(reasoning_end, 1)[0] 329 - 330 + 330 331 # Yield reasoning content (only if not hiding reasoning) 331 332 if reasoning_part and not self.hide_reasoning: 332 333 yield reasoning_part 333 - 334 + 334 335 # Add separator (only if not hiding reasoning) 335 336 if not self.hide_reasoning: 336 337 yield "\n\n---\n\n**[Answer]**\n\n" 337 - 338 + 338 339 # Switch to final state 339 340 self.buffer = self.buffer.split(reasoning_end, 1)[1] 340 341 self.state = "IN_FINAL" 341 - self._final_content_started = False # Track if we've started outputting final content 342 - 342 + self._final_content_started = ( 343 + False # Track if we've 
started outputting final content 344 + ) 345 + 343 346 # Skip intermediate control tokens 344 - skip_tokens = self.patterns.get('skip_tokens', []) 347 + skip_tokens = self.patterns.get("skip_tokens", []) 345 348 for skip_token in skip_tokens: 346 - self.buffer = self.buffer.replace(skip_token, '') 347 - 349 + self.buffer = self.buffer.replace(skip_token, "") 350 + 348 351 # Skip final marker when we find it 349 - final_marker = self.patterns.get('final_marker') 352 + final_marker = self.patterns.get("final_marker") 350 353 if final_marker and final_marker in self.buffer: 351 354 self.buffer = self.buffer.split(final_marker, 1)[1] 352 - 355 + 353 356 # Process remaining buffer 354 357 if self.buffer.strip(): 355 358 yield from self.process_token("") 356 359 return 357 - 360 + 358 361 # Still in reasoning, yield the content (only if not hiding reasoning) 359 362 if not self.hide_reasoning: 360 363 yield token 361 - 364 + 362 365 # State: IN_FINAL - normal streaming of final answer 363 366 elif self.state == "IN_FINAL": 364 367 # Check for control tokens from patterns that should be filtered 365 - skip_tokens = self.patterns.get('skip_tokens', []) 366 - conditional_skip = self.patterns.get('conditional_skip', []) 367 - 368 + skip_tokens = self.patterns.get("skip_tokens", []) 369 + conditional_skip = self.patterns.get("conditional_skip", []) 370 + 368 371 # Check if buffer contains any skip tokens and filter them out 369 372 for skip_token in skip_tokens: 370 373 if skip_token in self.buffer: 371 374 # Remove the skip token and continue 372 - self.buffer = self.buffer.replace(skip_token, '') 375 + self.buffer = self.buffer.replace(skip_token, "") 373 376 # Process remaining buffer if any 374 377 if self.buffer.strip(): 375 378 yield from self.process_token("") 376 379 return 377 - 380 + 378 381 # Check for final marker and filter it too 379 - final_marker = self.patterns.get('final_marker') 382 + final_marker = self.patterns.get("final_marker") 380 383 if final_marker 
and final_marker in self.buffer: 381 384 # Split at final marker and yield only content after it 382 385 parts = self.buffer.split(final_marker, 1) ··· 388 391 else: 389 392 # Just the marker itself, skip it 390 393 return 391 - 394 + 392 395 # Check conditional skip tokens - only at start of final section 393 - if not getattr(self, '_final_content_started', False): 396 + if not getattr(self, "_final_content_started", False): 394 397 for cond_token in conditional_skip: 395 398 if token.strip() == cond_token: 396 399 # Skip this token at the beginning of final section 397 400 return 398 401 # Mark that final content has started after first non-conditional token 399 - if token.strip() and not any(token.strip() == ct for ct in conditional_skip): 402 + if token.strip() and not any( 403 + token.strip() == ct for ct in conditional_skip 404 + ): 400 405 self._final_content_started = True 401 - 406 + 402 407 # Check if we might be building up to a skip token - be conservative 403 408 potential_skip = False 404 409 for skip_token in skip_tokens: 405 - if skip_token.startswith(token) or any(skip_token.startswith(self.buffer[-i:]) for i in range(1, min(len(skip_token), len(self.buffer)) + 1)): 410 + if skip_token.startswith(token) or any( 411 + skip_token.startswith(self.buffer[-i:]) 412 + for i in range(1, min(len(skip_token), len(self.buffer)) + 1) 413 + ): 406 414 potential_skip = True 407 415 break 408 - 416 + 409 417 if potential_skip: 410 418 # Don't yield yet, might be building up to a skip token 411 419 return 412 - 420 + 413 421 # Normal token in final answer - safe to yield 414 422 yield token 415 - 423 + 416 424 def finalize(self): 417 425 """ 418 426 Finalize parsing and yield any remaining buffer content. ··· 428 436 elif self.state == "IN_FINAL": 429 437 # Final answer content 430 438 yield self.buffer 431 -

+1

server/runtime.py

··· 1 + backend = None

+65

server/schemas.py

# Pydantic models describing the request and response bodies used by the server.
from pydantic import BaseModel, Field
from typing import Any, Dict, List, Optional, Union

# Request body for a plain text completion.
# Only `model` and `prompt` are required; `prompt` may be one string or a list
# of strings. Every other field is optional and falls back to the default shown.
# NOTE(review): the generation options (temperature, top_p, ...) are presumably
# forwarded to the model backend - confirm against the request handlers.
class CompletionRequest(BaseModel):
    model: str
    prompt: Union[str, List[str]]
    max_tokens: Optional[int] = None
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    stream: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = None
    repetition_penalty: Optional[float] = 1.1


# One message of a chat conversation.
# `role` is validated against a regex that only admits "system", "user" or
# "assistant"; any other value fails validation.
class ChatMessage(BaseModel):
    role: str = Field(..., pattern="^(system|user|assistant)$")
    content: str


# Request body for a chat completion.
# `model`, `messages`, `chat_start` and `python_code` have no defaults and are
# therefore required; the remaining options mirror CompletionRequest.
# NOTE(review): the meaning of `chat_start` and `python_code` is not visible
# from this file - confirm with the code that consumes this model.
class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    chat_start: bool
    python_code: str
    max_tokens: Optional[int] = None
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    stream: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = None
    repetition_penalty: Optional[float] = 1.1


# Response body for a text completion; `object` defaults to "text_completion".
# `choices` is a list of free-form dictionaries and `usage` maps names to ints.
class CompletionResponse(BaseModel):
    id: str
    object: str = "text_completion"
    created: int
    model: str
    choices: List[Dict[str, Any]]
    usage: Dict[str, int]


# Response body for a chat completion; `object` defaults to "chat.completion".
# Unlike CompletionResponse, the `usage` field is commented out here, so it is
# not part of this model.
class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Dict[str, Any]]
    # usage: Dict[str, int]


# Description of a single model; `object` defaults to "model" and `owned_by`
# to "mlx-knife". `permission` defaults to an empty list.
class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    owned_by: str = "mlx-knife"
    permission: List = []
    context_length: Optional[int] = None


# Request body carrying a model name and a memory path (both required).
class StartRequest(BaseModel):
    model: str
    memory_path: str


# Request body carrying only the model name.
# NOTE(review): the class name is lower camel case, unlike every other model in
# this file; renaming it would affect importers, so it is left as is.
class downloadRequest(BaseModel):
    model: str

-41

src/commands/mod.rs

··· 1 - // Module that handles CLI commands 2 - 3 - use anyhow::Result; 4 - use tiles::{ 5 - core::{ 6 - health, 7 - modelfile::{self, Modelfile}, 8 - }, 9 - runner::mlx, 10 - }; 11 - 12 - const DEFAULT_MODELFILE: &str = " 13 - FROM driaforall/mem-agent-mlx-4bit 14 - "; 15 - 16 - pub async fn run(modelfile: Option<String>) { 17 - let modelfile_parse_result: Result<Modelfile, String> = if let Some(modelfile_str) = modelfile { 18 - modelfile::parse_from_file(modelfile_str.as_str()) 19 - } else { 20 - modelfile::parse(DEFAULT_MODELFILE) 21 - }; 22 - 23 - match modelfile_parse_result { 24 - Ok(modelfile) => { 25 - mlx::run(modelfile).await; 26 - } 27 - Err(err) => println!("{}", err), 28 - } 29 - } 30 - 31 - pub fn check_health() { 32 - health::check_health(); 33 - } 34 - 35 - pub async fn start_server() { 36 - let _ = mlx::start_server_daemon().await; 37 - } 38 - 39 - pub fn stop_server() { 40 - let _ = mlx::stop_server_daemon(); 41 - }

src/core/health.rs tiles/src/core/health.rs

-2

src/core/mod.rs

··· 1 - pub mod health; 2 - pub mod modelfile;

src/core/modelfile.rs tilekit/src/modelfile.rs

+1 -1

src/lib.rs tiles/src/lib.rs

··· 1 1 pub mod core; 2 - pub mod runner; 2 + pub mod runtime; 3 3 4 4 #[cfg(test)] 5 5 mod tests {}

+6 -4

src/main.rs tiles/src/main.rs

··· 1 1 use std::error::Error; 2 2 3 3 use clap::{Args, Parser, Subcommand}; 4 + use tiles::runtime::build_runtime; 4 5 mod commands; 5 6 #[derive(Debug, Parser)] 6 7 #[command(name = "tiles")] ··· 38 39 /// Stops the daemon py server 39 40 Stop, 40 41 } 41 - #[tokio::main] 42 + #[tokio::main(flavor = "current_thread")] 42 43 pub async fn main() -> Result<(), Box<dyn Error>> { 43 44 let cli = Cli::parse(); 45 + let runtime = build_runtime(); 44 46 match cli.command { 45 47 Commands::Run { modelfile_path } => { 46 - commands::run(modelfile_path).await; 48 + commands::run(&runtime, modelfile_path).await; 47 49 } 48 50 Commands::Health => { 49 51 commands::check_health(); 50 52 } 51 53 Commands::Server(server) => match server.command { 52 - Some(ServerCommands::Start) => commands::start_server().await, 53 - Some(ServerCommands::Stop) => commands::stop_server(), 54 + Some(ServerCommands::Start) => commands::start_server(&runtime).await, 55 + Some(ServerCommands::Stop) => commands::stop_server(&runtime).await, 54 56 _ => println!("Expected start or stop"), 55 57 }, 56 58 }

+85 -77

src/runner/mlx.rs tiles/src/runtime/mlx.rs

··· 1 - use crate::core::modelfile::Modelfile; 2 1 use anyhow::{Context, Result}; 3 2 use futures_util::StreamExt; 4 3 use owo_colors::OwoColorize; ··· 11 10 use std::time::Duration; 12 11 use std::{env, fs}; 13 12 use std::{io, process::Command}; 13 + use tilekit::modelfile::Modelfile; 14 14 use tokio::time::sleep; 15 + 16 + pub struct MLXRuntime {} 17 + 18 + impl MLXRuntime {} 15 19 pub struct ChatResponse { 16 20 // think: String, 17 21 reply: String, 18 22 code: String, 19 23 } 20 24 21 - pub async fn run(modelfile: Modelfile) { 22 - let model = modelfile.from.as_ref().unwrap(); 23 - if model.starts_with("driaforall/mem-agent") { 24 - let _res = run_model_with_server(modelfile).await; 25 - } else { 26 - run_model_by_sub_process(modelfile); 25 + impl Default for MLXRuntime { 26 + fn default() -> Self { 27 + Self::new() 28 + } 29 + } 30 + 31 + impl MLXRuntime { 32 + pub fn new() -> Self { 33 + MLXRuntime {} 34 + } 35 + 36 + pub async fn run(&self, run_args: super::RunArgs) { 37 + let model = run_args.modelfile.from.as_ref().unwrap(); 38 + if model.starts_with("driaforall/mem-agent") { 39 + let _res = run_model_with_server(self, run_args.modelfile).await; 40 + } else { 41 + run_model_by_sub_process(run_args.modelfile); 42 + } 43 + } 44 + 45 + #[allow(clippy::zombie_processes)] 46 + pub async fn start_server_daemon(&self) -> Result<()> { 47 + // check if the server is running 48 + // start server as a child process 49 + // save the pid in a file under ~/.config/tiles/server_pid 50 + 51 + if (ping().await).is_ok() { 52 + println!("server is already up"); 53 + return Ok(()); 54 + } 55 + 56 + let config_dir = get_config_dir()?; 57 + let mut server_dir = get_server_dir()?; 58 + let pid_file = config_dir.join("server.pid"); 59 + fs::create_dir_all(&config_dir).context("Failed to create config directory")?; 60 + 61 + let stdout_log = File::create(config_dir.join("server.out.log"))?; 62 + let stderr_log = File::create(config_dir.join("server.err.log"))?; 63 + let 
server_path = server_dir.join(".venv/bin/python3"); 64 + server_dir.pop(); 65 + let child = Command::new(server_path) 66 + .args(["-m", "server.main"]) 67 + .current_dir(server_dir) 68 + .stdin(Stdio::null()) 69 + .stdout(Stdio::from(stdout_log)) 70 + .stderr(Stdio::from(stderr_log)) 71 + .spawn() 72 + .expect("failed to start server"); 73 + 74 + fs::create_dir_all(&config_dir).context("Failed to create config directory")?; 75 + std::fs::write(pid_file, child.id().to_string()).unwrap(); 76 + println!("Server started with PID {}", child.id()); 77 + Ok(()) 78 + } 79 + 80 + pub async fn stop_server_daemon(&self) -> Result<()> { 81 + if (ping().await).is_err() { 82 + println!("Server is not running"); 83 + return Ok(()); 84 + } 85 + let pid_file = get_config_dir()?.join("server.pid"); 86 + 87 + if !pid_file.exists() { 88 + eprintln!("server pid doesnt exist"); 89 + return Ok(()); 90 + } 91 + 92 + let pid = std::fs::read_to_string(&pid_file).unwrap(); 93 + Command::new("kill").arg(pid.trim()).status().unwrap(); 94 + std::fs::remove_file(pid_file).unwrap(); 95 + println!("Server stopped."); 96 + Ok(()) 27 97 } 28 98 } 29 99 ··· 82 152 } 83 153 } 84 154 85 - #[allow(clippy::zombie_processes)] 86 - pub async fn start_server_daemon() -> Result<()> { 87 - // check if the server is running 88 - // start server as a child process 89 - // save the pid in a file under ~/.config/tiles/server_pid 90 - 91 - if (ping().await).is_ok() { 92 - println!("server is already up"); 93 - return Ok(()); 94 - } 95 - 96 - let config_dir = get_config_dir()?; 97 - let server_dir = get_server_dir()?; 98 - let pid_file = config_dir.join("server.pid"); 99 - fs::create_dir_all(&config_dir).context("Failed to create config directory")?; 100 - 101 - let stdout_log = File::create(config_dir.join("server.out.log"))?; 102 - let stderr_log = File::create(config_dir.join("server.err.log"))?; 103 - let child = Command::new("uv") 104 - .args([ 105 - "run", 106 - "--project", 107 - 
server_dir.to_str().unwrap(), 108 - "python", 109 - "-m", 110 - "server.main", 111 - ]) 112 - .stdin(Stdio::null()) 113 - .stdout(Stdio::from(stdout_log)) 114 - .stderr(Stdio::from(stderr_log)) 115 - .spawn() 116 - .expect("failed to start server"); 117 - 118 - fs::create_dir_all(&config_dir).context("Failed to create config directory")?; 119 - std::fs::write(pid_file, child.id().to_string()).unwrap(); 120 - println!("Server started with PID {}", child.id()); 121 - Ok(()) 122 - } 123 - 124 - pub fn stop_server_daemon() -> Result<()> { 125 - let pid_file = get_config_dir()?.join("server.pid"); 126 - 127 - if !pid_file.exists() { 128 - eprintln!("Server is not running"); 129 - return Ok(()); 130 - } 131 - 132 - let pid = std::fs::read_to_string(&pid_file).unwrap(); 133 - Command::new("kill").arg(pid.trim()).status().unwrap(); 134 - std::fs::remove_file(pid_file).unwrap(); 135 - println!("Server stopped."); 136 - Ok(()) 137 - } 138 - async fn run_model_with_server(modelfile: Modelfile) -> reqwest::Result<()> { 155 + async fn run_model_with_server( 156 + mlx_runtime: &MLXRuntime, 157 + modelfile: Modelfile, 158 + ) -> reqwest::Result<()> { 139 159 if !cfg!(debug_assertions) { 140 - let _res = start_server_daemon().await; 160 + let _res = mlx_runtime.start_server_daemon().await; 141 161 let _ = wait_until_server_is_up().await; 142 162 } 143 163 let stdin = io::stdin(); ··· 159 179 match input { 160 180 "exit" => { 161 181 println!("Exiting interactive mode"); 182 + if !cfg!(debug_assertions) { 183 + let _res = mlx_runtime.stop_server_daemon().await; 184 + } 162 185 break; 163 186 } 164 187 _ => { ··· 270 293 271 294 let mut stream = res.bytes_stream(); 272 295 let mut accumulated = String::new(); 273 - // let mut inside_python = false; 274 - // let mut tag_buffer = String::new(); 275 296 println!(); 276 297 while let Some(chunk) = stream.next().await { 277 298 let chunk = chunk.unwrap(); ··· 296 317 } 297 318 } 298 319 } 299 - // println!("{:?}", res); 300 - // if 
res.status() == 200 { 301 - // let text = res.text().await.unwrap(); 302 - // let v: Value = serde_json::from_str(&text).unwrap(); 303 - // let content = v["choices"][0]["message"]["content"] 304 - // .as_str() 305 - // .unwrap_or("<no content>"); 306 - 307 - // // Ok(convert_to_chat_response(content)) 308 - // } else { 309 - // // Err(String::from("request failed")) 310 - // } 311 - // unimplemented!() 312 320 Err(String::from("request failed")) 313 321 } 314 322

-1

src/runner/mod.rs

··· 1 - pub mod mlx;

+7

tilekit/Cargo.toml

··· 1 + [package] 2 + name = "tilekit" 3 + version = "0.1.0" 4 + edition = "2024" 5 + 6 + [dependencies] 7 + nom = "8"

+1

tilekit/src/lib.rs

··· 1 + pub mod modelfile;

+16

tiles/Cargo.toml

··· 1 + [package] 2 + name = "tiles" 3 + version = "0.3.0" 4 + edition = "2024" 5 + 6 + [dependencies] 7 + tilekit = {path = "../tilekit"} 8 + clap = { version = "4.5.48", features = ["derive"] } 9 + reqwest = { version = "0.12", features = ["json", "blocking", "stream"] } 10 + serde = { version = "1.0", features = ["derive"] } 11 + serde_json = "1.0" 12 + anyhow = "1.0" 13 + tokio = { version = "1" , features = ["macros", "rt-multi-thread"]} 14 + owo-colors = "4" 15 + futures-util = "0.3" 16 +

+36

tiles/src/commands/mod.rs

··· 1 + // Module that handles CLI commands 2 + 3 + use anyhow::Result; 4 + use tilekit::{modelfile, modelfile::Modelfile}; 5 + use tiles::runtime::Runtime; 6 + use tiles::{core::health, runtime::RunArgs}; 7 + const DEFAULT_MODELFILE: &str = " 8 + FROM driaforall/mem-agent-mlx-4bit 9 + "; 10 + 11 + pub async fn run(runtime: &Runtime, modelfile: Option<String>) { 12 + let modelfile_parse_result: Result<Modelfile, String> = if let Some(modelfile_str) = modelfile { 13 + modelfile::parse_from_file(modelfile_str.as_str()) 14 + } else { 15 + modelfile::parse(DEFAULT_MODELFILE) 16 + }; 17 + match modelfile_parse_result { 18 + Ok(modelfile) => { 19 + let run_args = RunArgs { modelfile }; 20 + runtime.run(run_args).await; 21 + } 22 + Err(_err) => println!("Invalid Modelfile"), 23 + } 24 + } 25 + 26 + pub fn check_health() { 27 + health::check_health(); 28 + } 29 + 30 + pub async fn start_server(runtime: &Runtime) { 31 + let _ = runtime.start_server_daemon().await; 32 + } 33 + 34 + pub async fn stop_server(runtime: &Runtime) { 35 + let _ = runtime.stop_server_daemon().await; 36 + }

+1

tiles/src/core/mod.rs

··· 1 + pub mod health;

+26

tiles/src/runtime/cpu.rs

··· 1 + use anyhow::Result; 2 + 3 + pub struct CPURuntime {} 4 + 5 + impl Default for CPURuntime { 6 + fn default() -> Self { 7 + Self::new() 8 + } 9 + } 10 + 11 + impl CPURuntime { 12 + pub fn new() -> Self { 13 + CPURuntime {} 14 + } 15 + pub async fn run(&self, _run_args: super::RunArgs) { 16 + unimplemented!() 17 + } 18 + 19 + pub async fn start_server_daemon(&self) -> Result<()> { 20 + unimplemented!() 21 + } 22 + 23 + pub async fn stop_server_daemon(&self) -> Result<()> { 24 + unimplemented!() 25 + } 26 + }

+49

tiles/src/runtime/mod.rs

··· 1 + #[allow(unused_imports)] 2 + use crate::runtime::cpu::CPURuntime; 3 + use crate::runtime::mlx::MLXRuntime; 4 + use anyhow::Result; 5 + use tilekit::modelfile::Modelfile; 6 + pub mod cpu; 7 + pub mod mlx; 8 + 9 + pub struct RunArgs { 10 + pub modelfile: Modelfile, 11 + } 12 + 13 + pub enum Runtime { 14 + Mlx(MLXRuntime), 15 + Cpu(CPURuntime), 16 + } 17 + 18 + impl Runtime { 19 + pub async fn run(&self, run_args: RunArgs) { 20 + match self { 21 + Runtime::Mlx(runtime) => runtime.run(run_args).await, 22 + Runtime::Cpu(runtime) => runtime.run(run_args).await, 23 + } 24 + } 25 + 26 + pub async fn start_server_daemon(&self) -> Result<()> { 27 + match self { 28 + Runtime::Mlx(runtime) => runtime.start_server_daemon().await, 29 + Runtime::Cpu(runtime) => runtime.start_server_daemon().await, 30 + } 31 + } 32 + 33 + pub async fn stop_server_daemon(&self) -> Result<()> { 34 + match self { 35 + Runtime::Mlx(runtime) => runtime.stop_server_daemon().await, 36 + Runtime::Cpu(runtime) => runtime.stop_server_daemon().await, 37 + } 38 + } 39 + } 40 + 41 + #[cfg(target_os = "macos")] 42 + pub fn build_runtime() -> Runtime { 43 + Runtime::Mlx(MLXRuntime::new()) 44 + } 45 + 46 + #[cfg(not(target_os = "macos"))] 47 + pub fn build_runtime() -> Runtime { 48 + Runtime::Cpu(CPURuntime::new()) 49 + }

Configure Feed

Configure Feed