diff --git a/docs/lemonade/README.md b/docs/lemonade/README.md index d5afff83..92e0a28d 100644 --- a/docs/lemonade/README.md +++ b/docs/lemonade/README.md @@ -83,12 +83,12 @@ To prompt your LLM, try one of the following: OGA iGPU: ```bash - lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are" + lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are" -t ``` Hugging Face: ```bash - lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are" + lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are" -t ``` The LLM will run with your provided prompt, and the LLM's response to your prompt will be printed to the screen. You can replace the `"Hello, my thoughts are"` with any prompt you like. @@ -97,6 +97,9 @@ You can also replace the `facebook/opt-125m` with any Hugging Face checkpoint yo You can also set the `--device` argument in `oga-load` and `huggingface-load` to load your LLM on a different device. +The `-t` (or `--template`) flag instructs lemonade to insert the prompt string into the model's chat template. +This typically results in the model returning a higher-quality response. + Run `lemonade huggingface-load -h` and `lemonade llm-prompt -h` to learn more about these tools. ## Accuracy diff --git a/examples/lemonade/server/continue.md b/examples/lemonade/server/continue.md index 176b674d..1ff9450a 100644 --- a/examples/lemonade/server/continue.md +++ b/examples/lemonade/server/continue.md @@ -31,22 +31,23 @@ This will add a Continue tab to your VS Code Activity Bar. > Note: The following instructions are based on instructions from Continue found [here](https://docs.continue.dev/customize/model-providers/openai#openai-compatible-servers--apis) 1. Open the Continue tab in your VS Code Activity Bar. -1. Click the gear icon at the top to open Settings. -1. 
Under "Configuration", click "Open Config File". -1. Replace the "models" key in the `config.json` with the following and save: - -```json - "models": [ - { - "title": "Lemonade", - "provider": "openai", - "model": "Qwen-1.5-7B-Chat-Hybrid", - "apiKey": "-", - "apiBase": "http://localhost:8000/api/v0" - } - ], +1. Click the chat box. Some buttons will appear at the bottom of the box, including `Select model`. +1. Click `Select model`, then `+ Add Chat model` to open the new model dialog box. +1. Click the `config file` link at the very bottom of the dialog to open `config.yaml`. +1. Replace the "models" key in the `config.yaml` with the following and save: + +```yaml +models: + - name: Lemonade + provider: openai + model: Qwen-1.5-7B-Chat-Hybrid + apiBase: http://localhost:8000/api/v0 + apiKey: none ``` +6. Close the dialog box. +7. Click the chat box again. You should see `Lemonade` where you used to see `Select model`. Ready! + ## Usage > Note: see the Continue [user guide](https://docs.continue.dev/) to learn about all of their features. 
diff --git a/installer/Installer.nsi b/installer/Installer.nsi index 575c7130..02b182de 100644 --- a/installer/Installer.nsi +++ b/installer/Installer.nsi @@ -16,7 +16,6 @@ OutFile "Lemonade_Server_Installer.exe" Var LogHandle Var LEMONADE_SERVER_STRING -Var LEMONADE_CONDA_ENV Var HYBRID_SELECTED Var HYBRID_CLI_OPTION @@ -54,9 +53,6 @@ SectionIn RO ; Read only, always installed remove_dir: ; Try to remove directory and verify it was successful - - ; Attempt conda remove of the env, to help speed things up - ExecWait 'conda env remove -yp "$INSTDIR\$LEMONADE_CONDA_ENV"' ; Delete all remaining files RMDir /r "$INSTDIR" @@ -103,97 +99,34 @@ SectionIn RO ; Read only, always installed DetailPrint "- Packaged repo" - ; Check if conda is available - ExecWait 'where conda' $2 - DetailPrint "- Checked if conda is available" - - ; If conda is not found, show a message - ; Otherwise, continue with the installation - StrCmp $2 "0" create_env conda_not_available - - conda_not_available: - DetailPrint "- Conda not installed." - ${IfNot} ${Silent} - MessageBox MB_YESNO "Conda is not installed. Would you like to install Miniconda?" IDYES install_miniconda IDNO exit_installer - ${Else} - Goto install_miniconda - ${EndIf} - - exit_installer: - DetailPrint "- Something went wrong. Exiting installer" - Quit - - install_miniconda: - DetailPrint "-------------" - DetailPrint "- Miniconda -" - DetailPrint "-------------" - DetailPrint "- Downloading Miniconda installer..." - ExecWait 'curl -s -o "$TEMP\Miniconda3-latest-Windows-x86_64.exe" "https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe"' - - ; Install Miniconda silently - ExecWait '"$TEMP\Miniconda3-latest-Windows-x86_64.exe" /InstallationType=JustMe /AddToPath=1 /RegisterPython=0 /S /D=$PROFILE\miniconda3' $2 - ; Check if Miniconda installation was successful - ${If} $2 == 0 - DetailPrint "- Miniconda installation successful" - ${IfNot} ${Silent} - MessageBox MB_OK "Miniconda has been successfully installed." 
- ${EndIf} - - StrCpy $R1 "$PROFILE\miniconda3\Scripts\conda.exe" - Goto create_env - - ${Else} - DetailPrint "- Miniconda installation failed" - ${IfNot} ${Silent} - MessageBox MB_OK "Error: Miniconda installation failed. Installation will be aborted." - ${EndIf} - Goto exit_installer - ${EndIf} - - create_env: - DetailPrint "---------------------" - DetailPrint "- Conda Environment -" - DetailPrint "---------------------" - - DetailPrint "- Initializing conda..." - ; Use the appropriate conda executable - ${If} $R1 == "" - StrCpy $R1 "conda" - ${EndIf} - ; Initialize conda (needed for systems where conda was previously installed but not initialized) - nsExec::ExecToStack '"$R1" init' - - DetailPrint "- Creating a Python 3.10 environment named '$LEMONADE_CONDA_ENV' in the installation directory: $INSTDIR..." - ExecWait '"$R1" create -p "$INSTDIR\$LEMONADE_CONDA_ENV" python=3.10 -y' $R0 - - ; Check if the environment creation was successful (exit code should be 0) - StrCmp $R0 0 install_lemonade env_creation_failed - - env_creation_failed: - DetailPrint "- ERROR: Environment creation failed" - ; Display an error message and exit - ${IfNot} ${Silent} - MessageBox MB_OK "ERROR: Failed to create the Python environment. Installation will be aborted." 
- ${EndIf} - Quit + DetailPrint "Set up Python" + CreateDirectory "$INSTDIR\python" + ExecWait 'curl -s -o "$INSTDIR\python\python.zip" "https://www.python.org/ftp/python/3.10.9/python-3.10.9-embed-amd64.zip"' + ExecWait 'tar -xf "$INSTDIR\python\python.zip" -C "$INSTDIR\python"' + ExecWait 'curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py' + ExecWait '$INSTDIR\python\python.exe get-pip.py --no-warn-script-location' + + FileOpen $2 "$INSTDIR\python\python310._pth" a + FileSeek $2 0 END + FileWrite $2 "$\r$\nLib$\r$\n" + FileWrite $2 "$\r$\nLib\site-packages$\r$\n" + FileClose $2 - install_lemonade: - DetailPrint "-------------------------" - DetailPrint "- Lemonade Installation -" - DetailPrint "-------------------------" + DetailPrint "-------------------------" + DetailPrint "- Lemonade Installation -" + DetailPrint "-------------------------" - DetailPrint "- Installing $LEMONADE_SERVER_STRING..." - ${If} $HYBRID_SELECTED == "true" - nsExec::ExecToLog '"$INSTDIR\$LEMONADE_CONDA_ENV\python.exe" -m pip install -e "$INSTDIR"[llm-oga-hybrid] --no-warn-script-location' - ${Else} - nsExec::ExecToLog '"$INSTDIR\$LEMONADE_CONDA_ENV\python.exe" -m pip install -e "$INSTDIR"[llm] --no-warn-script-location' - ${EndIf} - Pop $R0 ; Return value - DetailPrint "- $LEMONADE_SERVER_STRING install return code: $R0" + DetailPrint "- Installing $LEMONADE_SERVER_STRING..." 
+ ${If} $HYBRID_SELECTED == "true" + ExecWait '"$INSTDIR\python\python.exe" -m pip install "$INSTDIR"[llm-oga-hybrid] --no-warn-script-location' $8 + ${Else} + ExecWait '"$INSTDIR\python\python.exe" -m pip install "$INSTDIR"[llm] --no-warn-script-location' $8 + ${EndIf} + DetailPrint "- $LEMONADE_SERVER_STRING install return code: $8" - ; Check if installation was successful (exit code should be 0) - StrCmp $R0 0 install_success install_failed + ; Check if installation was successful (exit code should be 0) + StrCmp $8 0 install_success install_failed install_success: DetailPrint "- $LEMONADE_SERVER_STRING installation successful" @@ -233,7 +166,7 @@ Section "Install Ryzen AI Hybrid Execution" HybridSec ; Once we're done downloading and installing the archive the size comes out to about 370MB AddSize 388882 - nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --ryzenai hybrid -y' + nsExec::ExecToLog '$INSTDIR\python\Scripts\lemonade-install --ryzenai hybrid -y' Pop $R0 ; Return value DetailPrint "Hybrid execution mode install return code: $R0" @@ -299,20 +232,19 @@ SubSection /e "Selected Models" ModelsSec ${GetParameters} $CMDLINE ${GetOptions} $CMDLINE "/Models=" $R0 ${If} $R0 != "" - nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models $R0' + nsExec::ExecToLog '$INSTDIR\python\Scripts\lemonade-install --models $R0' ${Else} ; Otherwise, only the default CPU model will be installed - nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models Qwen2.5-0.5B-Instruct-CPU' + nsExec::ExecToLog '$INSTDIR\python\Scripts\lemonade-install --models Qwen2.5-0.5B-Instruct-CPU' ${EndIf} ${Else} - nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models $9' + nsExec::ExecToLog '$INSTDIR\python\Scripts\lemonade-install --models $9' ${EndIf} SectionEnd SubSectionEnd Section "-Add 
Desktop Shortcut" ShortcutSec - ; Create a desktop shortcut that passes the conda environment name as a parameter CreateShortcut "$DESKTOP\lemonade-server.lnk" "$INSTDIR\bin\lemonade-server.bat" "serve --keep-alive" "$INSTDIR\img\favicon.ico" SectionEnd @@ -458,7 +390,7 @@ LangString MUI_TEXT_ABORT_SUBTITLE "${LANG_ENGLISH}" "Installation has been abor LangString MUI_BUTTONTEXT_FINISH "${LANG_ENGLISH}" "Finish" LangString MUI_TEXT_LICENSE_TITLE ${LANG_ENGLISH} "AMD License Agreement" LangString MUI_TEXT_LICENSE_SUBTITLE ${LANG_ENGLISH} "Please review the license terms before installing AMD Ryzen AI Hybrid Execution Mode." -LangString DESC_SEC01 ${LANG_ENGLISH} "The minimum set of dependencies for a lemonade server that runs LLMs on CPU." +LangString DESC_SEC01 ${LANG_ENGLISH} "The minimum set of dependencies for a lemonade server that runs LLMs on CPU (includes Python)." LangString DESC_HybridSec ${LANG_ENGLISH} "Add support for running LLMs on Ryzen AI hybrid execution mode. Only available on Ryzen AI 300-series processors." LangString DESC_ModelsSec ${LANG_ENGLISH} "Select which models to install" LangString DESC_Qwen05Sec ${LANG_ENGLISH} "Small CPU-only Qwen model" @@ -485,7 +417,6 @@ LangString DESC_DeepSeekQwen7BSec ${LANG_ENGLISH} "7B parameter DeepSeek Qwen mo Function .onInit StrCpy $LEMONADE_SERVER_STRING "Lemonade Server" - StrCpy $LEMONADE_CONDA_ENV "lemon_env" StrCpy $HYBRID_SELECTED "true" ; Create a variable to store selected models diff --git a/installer/lemonade-server.bat b/installer/lemonade-server.bat index 06186805..80c3e502 100644 --- a/installer/lemonade-server.bat +++ b/installer/lemonade-server.bat @@ -1,6 +1,5 @@ @echo off setlocal enabledelayedexpansion -set CONDA_ENV=lemon_env REM --keep-alive is only used by the bash script to make sure that, if the server fails to open, we don't close the terminal right away. 
REM Check for --keep-alive argument and remove it from arguments passed to CLI @@ -18,7 +17,7 @@ REM Change to parent directory where conda env and bin folders are located pushd "%~dp0.." REM Run the Python CLI script through conda, passing filtered arguments -call conda run --no-capture-output -p "%CD%\%CONDA_ENV%" lemonade-server-dev !ARGS! +call "%CD%\python\Scripts\lemonade-server-dev" !ARGS! popd REM Error handling: Show message and pause if --keep-alive was specified diff --git a/installer/run_server.bat b/installer/run_server.bat deleted file mode 100644 index 6c6ffec1..00000000 --- a/installer/run_server.bat +++ /dev/null @@ -1,12 +0,0 @@ -@echo off -set CONDA_ENV=%1 -if "%CONDA_ENV%"=="" set CONDA_ENV=lemon_env -echo Starting Lemonade Server... -call conda run --no-capture-output -p "%~dp0%CONDA_ENV%" lemonade serve -if %ERRORLEVEL% neq 0 ( - echo. - echo An error occurred while running Lemonade Server. - echo Please check the error message above. - echo. - pause -) diff --git a/src/lemonade/cache.py b/src/lemonade/cache.py index b3e4cd84..c992865e 100644 --- a/src/lemonade/cache.py +++ b/src/lemonade/cache.py @@ -68,6 +68,7 @@ class Keys: DTYPE = "dtype" PROMPT = "prompt" PROMPT_TOKENS = "prompt_tokens" + PROMPT_TEMPLATE = "prompt_template" RESPONSE = "response" RESPONSE_TOKENS = "response_tokens" RESPONSE_LENGTHS_HISTOGRAM = "response_lengths_histogram" diff --git a/src/lemonade/tools/ort_genai/oga.py b/src/lemonade/tools/ort_genai/oga.py index b067cd2f..b92dbf2d 100644 --- a/src/lemonade/tools/ort_genai/oga.py +++ b/src/lemonade/tools/ort_genai/oga.py @@ -330,6 +330,14 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser: help="Download the model if needed, but don't load it", ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Set this flag to use models whose code is on the Hugging Face hub rather " + "than natively in the OnnxRuntime Gen AI libraries. 
Please review the model code " + "in advance as this is a security risk.", + ) + parser.add_argument( "--subfolder", default=None, @@ -547,15 +555,28 @@ def _setup_npu_environment(): return saved_state @staticmethod - def _load_model_and_setup_state(state, full_model_path, checkpoint): + def _load_model_and_setup_state( + state, full_model_path, checkpoint, trust_remote_code + ): """ Loads the OGA model from local folder and then loads the tokenizer. """ state.model = OrtGenaiModel(full_model_path) - hf_tokenizer = AutoTokenizer.from_pretrained( - full_model_path, local_files_only=True - ) + try: + hf_tokenizer = AutoTokenizer.from_pretrained( + full_model_path, + local_files_only=True, + trust_remote_code=trust_remote_code, + ) + except ValueError as e: + if "trust_remote_code" in str(e): + raise ValueError( + "This model requires you to execute code from the repo. Please review it " + "and if you trust it, then use the `--trust-remote-code` flag with oga-load." + ) + raise + state.tokenizer = OrtGenaiTokenizer( state.model.model, hf_tokenizer, @@ -582,6 +603,7 @@ def run( int4_block_size: int = None, force: bool = False, download_only: bool = False, + trust_remote_code=False, subfolder: str = None, ) -> State: state.device = device @@ -671,7 +693,9 @@ def run( "0" if "phi-" in checkpoint.lower() else "1" ) - self._load_model_and_setup_state(state, full_model_path, checkpoint) + self._load_model_and_setup_state( + state, full_model_path, checkpoint, trust_remote_code + ) finally: self._cleanup_environment(saved_env_state) diff --git a/src/lemonade/tools/prompt.py b/src/lemonade/tools/prompt.py index 28a44284..79095ff8 100644 --- a/src/lemonade/tools/prompt.py +++ b/src/lemonade/tools/prompt.py @@ -2,6 +2,7 @@ import os import matplotlib.pyplot as plt import turnkeyml.common.build as build +import turnkeyml.common.printing as printing from turnkeyml.state import State from turnkeyml.tools import Tool from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter 
@@ -60,6 +61,7 @@ def __init__(self): self.status_stats = [ Keys.PROMPT_TOKENS, Keys.PROMPT, + Keys.PROMPT_TEMPLATE, Keys.RESPONSE_TOKENS, Keys.RESPONSE, Keys.RESPONSE_LENGTHS_HISTOGRAM, @@ -75,12 +77,19 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--prompt", "-p", - help="Input prompt to the LLM. Two formats are supported. " - "1) str: use a user-provided prompt string " + help="Input prompt to the LLM. Two formats are supported: " + "1) str: use a user-provided prompt string, and " "2) path/to/prompt.txt: load the prompt from a .txt file.", required=True, ) + parser.add_argument( + "--template", + "-t", + action="store_true", + help="Insert the prompt into the model's chat template before processing.", + ) + parser.add_argument( "--max-new-tokens", "-m", @@ -113,9 +122,6 @@ def parse(self, state: State, args, known_only=True) -> argparse.Namespace: if parsed_args.prompt.endswith(".txt") and os.path.exists(parsed_args.prompt): with open(parsed_args.prompt, "r", encoding="utf-8") as f: parsed_args.prompt = f.read() - else: - # No change to the prompt - pass return parsed_args @@ -125,11 +131,28 @@ def run( prompt: str = "Hello", max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, n_trials: int = DEFAULT_N_TRIALS, + template: bool = False, ) -> State: model: ModelAdapter = state.model tokenizer: TokenizerAdapter = state.tokenizer + # If template flag is set, then wrap prompt in template + if template: + # Embed prompt in model's chat template + if tokenizer.chat_template: + # Use the model's built-in chat template if available + messages_dict = [{"role": "user", "content": prompt}] + prompt = tokenizer.apply_chat_template( + messages_dict, tokenize=False, add_generation_prompt=True + ) + state.save_stat(Keys.PROMPT_TEMPLATE, "Model-specific") + else: + # Fallback to a standardized template + printing.log_info("No chat template found. 
Using default template.") + prompt = f"<|user|>\n{prompt} <|end|>\n<|assistant|>" + state.save_stat(Keys.PROMPT_TEMPLATE, "Default") + input_ids = tokenizer(prompt, return_tensors="pt").input_ids if isinstance(input_ids, (list, str)): # OGA models return a list of tokens diff --git a/src/lemonade/tools/serve.py b/src/lemonade/tools/serve.py index 2f904bfc..d561ed58 100644 --- a/src/lemonade/tools/serve.py +++ b/src/lemonade/tools/serve.py @@ -22,11 +22,8 @@ from openai.types.chat.chat_completion_chunk import ChoiceDelta from openai.types.model import Model -from turnkeyml.state import State from turnkeyml.tools.management_tools import ManagementTool -from lemonade.tools.adapter import ModelAdapter -from lemonade.tools.huggingface_load import HuggingfaceLoad -from lemonade.cache import DEFAULT_CACHE_DIR +import lemonade.api as lemonade_api from lemonade_install.install import ModelManager # Set to a high number to allow for interesting experiences in real apps @@ -87,14 +84,16 @@ class LoadConfig(BaseModel): """ Configuration for loading a language model. - Specifies the model checkpoint, cache directory, generation parameters, - and hardware configuration for model loading. + Specifies the model checkpoint, generation parameters, + and hardware/framework configuration (recipe) for model loading. 
""" checkpoint: str - cache_dir: str = DEFAULT_CACHE_DIR max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS - device: str = "cpu" + recipe: str = "hf-cpu" + # Indicates the maximum prompt length allowed for that specific + # checkpoint + recipe combination + max_length: int = None class CompletionRequest(BaseModel): @@ -185,11 +184,11 @@ def __init__(self): # Flag that tells the LLM to stop generating text and end the response self.stop_event = Event() - self.llm_loaded = None + self.llm_loaded: LoadConfig = None self.tokenizer = None - # Placeholders for state and configs - self.state = None + # Placeholders for model and configs + self.model = None self.max_new_tokens = None # Initialize semaphore for tracking active generations @@ -242,7 +241,9 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser: def run( self, - cache_dir: str = DEFAULT_CACHE_DIR, + # ManagementTool has a required cache_dir arg, but + # we always use the default cache directory + _=None, checkpoint: str = None, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, port: int = DEFAULT_PORT, @@ -286,7 +287,6 @@ def trace(message, *args, **kwargs): # Only load the model when starting the server if checkpoint was provided if checkpoint: config = LoadConfig( - cache_dir=cache_dir, checkpoint=checkpoint, max_new_tokens=max_new_tokens, ) @@ -369,7 +369,7 @@ async def generate(): completion = Completion( id="0", choices=[choice], - model=self.llm_loaded, + model=self.llm_loaded.checkpoint, object="text_completion", created=int(time.time()), ) @@ -406,7 +406,7 @@ async def generate(): return Completion( id="0", choices=[choice], - model=self.llm_loaded, + model=self.llm_loaded.checkpoint, object="text_completion", created=int(time.time()), ) @@ -475,7 +475,7 @@ async def generate(): id="0", object="chat.completion.chunk", created=int(time.time()), - model=self.llm_loaded, + model=self.llm_loaded.checkpoint, choices=[ Choice.model_construct( index=0, @@ -539,7 +539,7 @@ async def generate(): return 
ChatCompletion( id="0", choices=[choice], - model=self.llm_loaded, + model=self.llm_loaded.checkpoint, object="chat.completion", created=int(time.time()), ) @@ -549,8 +549,8 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No Core streaming completion logic, separated from response handling. Returns an async generator that yields tokens. """ - model = self.state.model # pylint: disable=no-member - tokenizer = self.state.tokenizer # pylint: disable=no-member + model = self.model + tokenizer = self.tokenizer # Reset the early-exit flag before we start each generation self.stop_event.clear() @@ -566,7 +566,7 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No stop_sequences = stop[:4] # Limit to 4 sequences as per spec # Set up the generation parameters - if isinstance(model, ModelAdapter) and model.type == "ort-genai": + if "oga-" in self.llm_loaded.recipe: from lemonade.tools.ort_genai.oga import OrtGenaiStreamer streamer = OrtGenaiStreamer(tokenizer) @@ -579,6 +579,16 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No ) self.input_tokens = len(input_ids[0]) + if ( + self.llm_loaded.max_length + and self.input_tokens > self.llm_loaded.max_length + ): + # This is the exact same exception message raised by OGA when max_length is exceeded + raise RuntimeError( + f"prompt tokens ({self.input_tokens}) cannot be greater " + f"than model context_length ({self.llm_loaded.max_length})" + ) + # Log the input tokens early to avoid this not showing due to potential crashes logging.debug(f"Input Tokens: {self.input_tokens}") logging.trace(f"Input Message: {message}") @@ -710,11 +720,7 @@ async def health(self): return { "status": "ok", - "model_loaded": ( - self.state.checkpoint # pylint: disable=no-member - if self.state - else None - ), + "model_loaded": (self.llm_loaded.checkpoint if self.llm_loaded else None), } async def load_llm(self, config: LoadConfig): @@ -725,7 +731,7 @@ 
async def load_llm(self, config: LoadConfig): for _ in range(self.max_concurrent_generations): await self._generate_semaphore.acquire() - if config.checkpoint == self.llm_loaded: + if self.llm_loaded and config.checkpoint == self.llm_loaded.checkpoint: return { "status": "success", "message": f"Model already loaded: {config.checkpoint}", @@ -738,33 +744,12 @@ async def load_llm(self, config: LoadConfig): self.max_new_tokens = config.max_new_tokens logging.info(f"Loading llm: {config.checkpoint}") try: - state = State( - cache_dir=config.cache_dir, - build_name="main", + self.model, self.tokenizer = lemonade_api.from_pretrained( + checkpoint=config.checkpoint, recipe=config.recipe ) - if config.device == "cpu": - huggingface_loader = HuggingfaceLoad() - self.state = huggingface_loader.run( - state, - input=config.checkpoint, - device=config.device, - dtype=torch.bfloat16, - ) - else: - from lemonade.tools.ort_genai.oga import OgaLoad - - oga_loader = OgaLoad() - self.state = oga_loader.run( - state, - input=config.checkpoint, - device=config.device, - dtype="int4", - force=False, - ) self.max_new_tokens = config.max_new_tokens - self.llm_loaded = config.checkpoint - self.tokenizer = self.state.tokenizer # pylint: disable=no-member + self.llm_loaded = config return { "status": "success", @@ -772,7 +757,8 @@ async def load_llm(self, config: LoadConfig): } except Exception: # pylint: disable=broad-exception-caught self.llm_loaded = None - self.state = None + self.tokenizer = None + self.model = None logging.exception(f"Tried to load LLM {config.checkpoint} and failed") raise HTTPException( @@ -797,7 +783,8 @@ async def unload_llm(self, require_lock: bool = True): await self._generate_semaphore.acquire() self.llm_loaded = None - self.state = None + self.tokenizer = None + self.model = None return {"status": "success", "message": "Unloaded model"} except Exception as e: # pylint: disable=broad-exception-caught return { diff --git a/src/lemonade_install/install.py 
b/src/lemonade_install/install.py index b1e01921..1983bf5a 100644 --- a/src/lemonade_install/install.py +++ b/src/lemonade_install/install.py @@ -107,7 +107,7 @@ def supported_cpu_models(self) -> dict: return { "Qwen2.5-0.5B-Instruct-CPU": { "checkpoint": "Qwen/Qwen2.5-0.5B-Instruct", - "device": "cpu", + "recipe": "hf-cpu", "reasoning": False, } } @@ -121,33 +121,39 @@ def supported_hybrid_models(self) -> dict: return { "Llama-3.2-1B-Instruct-Hybrid": { "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", - "device": "hybrid", + "recipe": "oga-hybrid", "reasoning": False, + "max_length": 3000, }, "Llama-3.2-3B-Instruct-Hybrid": { "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", - "device": "hybrid", + "recipe": "oga-hybrid", "reasoning": False, + "max_length": 2000, }, "Phi-3-Mini-Instruct-Hybrid": { "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid", - "device": "hybrid", + "recipe": "oga-hybrid", "reasoning": False, + "max_length": 2000, }, "Qwen-1.5-7B-Chat-Hybrid": { "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid", - "device": "hybrid", + "recipe": "oga-hybrid", "reasoning": False, + "max_length": 3000, }, "DeepSeek-R1-Distill-Llama-8B-Hybrid": { "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid", - "device": "hybrid", + "recipe": "oga-hybrid", "reasoning": True, + "max_length": 2000, }, "DeepSeek-R1-Distill-Qwen-7B-Hybrid": { "checkpoint": "amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid", - "device": "hybrid", + "recipe": "oga-hybrid", "reasoning": True, + "max_length": 2000, }, } diff --git a/src/lemonade_server/cli.py b/src/lemonade_server/cli.py index 667d48b2..59e33cbe 100644 --- a/src/lemonade_server/cli.py +++ b/src/lemonade_server/cli.py @@ -18,7 +18,7 @@ def serve(args): "Please stop the existing server before starting a new instance." 
), ) - return + sys.exit(1) # Otherwise, start the server print("Starting Lemonade Server...") diff --git a/src/turnkeyml/version.py b/src/turnkeyml/version.py index 86fe47d5..5d975072 100644 --- a/src/turnkeyml/version.py +++ b/src/turnkeyml/version.py @@ -1 +1 @@ -__version__ = "6.1.3" +__version__ = "6.1.4"