diff --git a/docs/lemonade/README.md b/docs/lemonade/README.md
index d5afff83..92e0a28d 100644
--- a/docs/lemonade/README.md
+++ b/docs/lemonade/README.md
@@ -83,12 +83,12 @@ To prompt your LLM, try one of the following:
 
 OGA iGPU:
 ```bash
-    lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"
+    lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are" -t
 ```
 
 Hugging Face:
 ```bash
-    lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are"
+    lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are" -t
 ```
 
 The LLM will run with your provided prompt, and the LLM's response to your prompt will be printed to the screen. You can replace the `"Hello, my thoughts are"` with any prompt you like.
@@ -97,6 +97,9 @@ You can also replace the `facebook/opt-125m` with any Hugging Face checkpoint yo
 
 You can also set the `--device` argument in `oga-load` and `huggingface-load` to load your LLM on a different device.
 
+The `-t` (or `--template`) flag instructs lemonade to insert the prompt string into the model's chat template.
+This typically results in the model returning a higher-quality response.
+
 Run `lemonade huggingface-load -h` and `lemonade llm-prompt -h` to learn more about these tools.
 
 ## Accuracy
diff --git a/examples/lemonade/server/continue.md b/examples/lemonade/server/continue.md
index 176b674d..1ff9450a 100644
--- a/examples/lemonade/server/continue.md
+++ b/examples/lemonade/server/continue.md
@@ -31,22 +31,23 @@ This will add a Continue tab to your VS Code Activity Bar.
 > Note: The following instructions are based on instructions from Continue found [here](https://docs.continue.dev/customize/model-providers/openai#openai-compatible-servers--apis) 
 
 1. Open the Continue tab in your VS Code Activity Bar.
-1. Click the gear icon at the top to open Settings.
-1. Under "Configuration", click "Open Config File".
-1. Replace the "models" key in the `config.json` with the following and save:
-
-```json
-  "models": [
-    {
-      "title": "Lemonade", 
-      "provider": "openai",
-      "model": "Qwen-1.5-7B-Chat-Hybrid",
-      "apiKey": "-",
-      "apiBase": "http://localhost:8000/api/v0"
-    }
-  ],
+1. Click the chat box. Some buttons will appear at the bottom of the box, including `Select model`.
+1. Click `Select model`, then `+ Add Chat model` to open the new model dialog box.
+1. Click the `config file` link at the very bottom of the dialog to open `config.yaml`.
+1. Replace the "models" key in the `config.yaml` with the following and save:
+
+```yaml
+models:
+  - name: Lemonade
+    provider: openai
+    model: Qwen-1.5-7B-Chat-Hybrid
+    apiBase: http://localhost:8000/api/v0
+    apiKey: none
 ```
 
+6. Close the dialog box.
+7. Click the chat box again. You should see `Lemonade` where you used to see `Select model`. Ready!
+
 ## Usage
 
 > Note: see the Continue [user guide](https://docs.continue.dev/) to learn about all of their features.
diff --git a/installer/Installer.nsi b/installer/Installer.nsi
index 575c7130..02b182de 100644
--- a/installer/Installer.nsi
+++ b/installer/Installer.nsi
@@ -16,7 +16,6 @@ OutFile "Lemonade_Server_Installer.exe"
 Var LogHandle
 
 Var LEMONADE_SERVER_STRING
-Var LEMONADE_CONDA_ENV
 Var HYBRID_SELECTED
 Var HYBRID_CLI_OPTION
 
@@ -54,9 +53,6 @@ SectionIn RO ; Read only, always installed
 
   remove_dir:
     ; Try to remove directory and verify it was successful
-
-    ; Attempt conda remove of the env, to help speed things up
-    ExecWait 'conda env remove -yp "$INSTDIR\$LEMONADE_CONDA_ENV"'
     
     ; Delete all remaining files
     RMDir /r "$INSTDIR"
@@ -103,97 +99,34 @@ SectionIn RO ; Read only, always installed
 
     DetailPrint "- Packaged repo"
 
-    ; Check if conda is available
-    ExecWait 'where conda' $2
-    DetailPrint "- Checked if conda is available"
-
-    ; If conda is not found, show a message
-    ; Otherwise, continue with the installation
-    StrCmp $2 "0" create_env conda_not_available
-
-    conda_not_available:
-      DetailPrint "- Conda not installed."
-      ${IfNot} ${Silent}
-        MessageBox MB_YESNO "Conda is not installed. Would you like to install Miniconda?" IDYES install_miniconda IDNO exit_installer
-      ${Else}
-        Goto install_miniconda
-      ${EndIf}
-
-    exit_installer:
-      DetailPrint "- Something went wrong. Exiting installer"
-      Quit
-
-    install_miniconda:
-      DetailPrint "-------------"
-      DetailPrint "- Miniconda -"
-      DetailPrint "-------------"
-      DetailPrint "- Downloading Miniconda installer..."
-      ExecWait 'curl -s -o "$TEMP\Miniconda3-latest-Windows-x86_64.exe" "https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe"'
-
-      ; Install Miniconda silently
-      ExecWait '"$TEMP\Miniconda3-latest-Windows-x86_64.exe" /InstallationType=JustMe /AddToPath=1 /RegisterPython=0 /S /D=$PROFILE\miniconda3' $2
-      ; Check if Miniconda installation was successful
-      ${If} $2 == 0
-        DetailPrint "- Miniconda installation successful"
-        ${IfNot} ${Silent}
-          MessageBox MB_OK "Miniconda has been successfully installed."
-        ${EndIf}
-
-        StrCpy $R1 "$PROFILE\miniconda3\Scripts\conda.exe"
-        Goto create_env
-
-      ${Else}
-        DetailPrint "- Miniconda installation failed"
-        ${IfNot} ${Silent}
-          MessageBox MB_OK "Error: Miniconda installation failed. Installation will be aborted."
-        ${EndIf}
-        Goto exit_installer
-      ${EndIf}
-
-    create_env:
-      DetailPrint "---------------------"
-      DetailPrint "- Conda Environment -"
-      DetailPrint "---------------------"
-
-      DetailPrint "- Initializing conda..."
-      ; Use the appropriate conda executable
-      ${If} $R1 == ""
-        StrCpy $R1 "conda"
-      ${EndIf}
-      ; Initialize conda (needed for systems where conda was previously installed but not initialized)
-      nsExec::ExecToStack '"$R1" init'
-
-      DetailPrint "- Creating a Python 3.10 environment named '$LEMONADE_CONDA_ENV' in the installation directory: $INSTDIR..."
-      ExecWait '"$R1" create -p "$INSTDIR\$LEMONADE_CONDA_ENV" python=3.10 -y' $R0
-
-      ; Check if the environment creation was successful (exit code should be 0)
-      StrCmp $R0 0 install_lemonade env_creation_failed
-
-    env_creation_failed:
-      DetailPrint "- ERROR: Environment creation failed"
-      ; Display an error message and exit
-      ${IfNot} ${Silent}
-        MessageBox MB_OK "ERROR: Failed to create the Python environment. Installation will be aborted."
-      ${EndIf}
-      Quit
+    DetailPrint "Set up Python" ; Download an embeddable CPython 3.10.9 into $INSTDIR\python and bootstrap pip into it
+    CreateDirectory "$INSTDIR\python"
+    ExecWait 'curl -s -o "$INSTDIR\python\python.zip" "https://www.python.org/ftp/python/3.10.9/python-3.10.9-embed-amd64.zip"'
+    ExecWait 'tar -xf "$INSTDIR\python\python.zip" -C "$INSTDIR\python"'
+    ExecWait 'curl -sSL https://bootstrap.pypa.io/get-pip.py -o "$INSTDIR\python\get-pip.py"' ; Explicit target so the download does not depend on the current working directory
+    ExecWait '"$INSTDIR\python\python.exe" "$INSTDIR\python\get-pip.py" --no-warn-script-location' ; Both paths quoted so an $INSTDIR containing spaces still resolves
+    ; Append "Lib" and "Lib\site-packages" entries to the end of the interpreter's python310._pth file
+    FileOpen $2 "$INSTDIR\python\python310._pth" a
+    FileSeek $2 0 END
+    FileWrite $2 "$\r$\nLib$\r$\n"
+    FileWrite $2 "$\r$\nLib\site-packages$\r$\n"
+    FileClose $2
 
-    install_lemonade:
-      DetailPrint "-------------------------"
-      DetailPrint "- Lemonade Installation -"
-      DetailPrint "-------------------------"
+    DetailPrint "-------------------------"
+    DetailPrint "- Lemonade Installation -"
+    DetailPrint "-------------------------"
 
 
-      DetailPrint "- Installing $LEMONADE_SERVER_STRING..."
-      ${If} $HYBRID_SELECTED == "true"
-        nsExec::ExecToLog '"$INSTDIR\$LEMONADE_CONDA_ENV\python.exe" -m pip install -e "$INSTDIR"[llm-oga-hybrid] --no-warn-script-location'
-      ${Else}
-        nsExec::ExecToLog '"$INSTDIR\$LEMONADE_CONDA_ENV\python.exe" -m pip install -e "$INSTDIR"[llm] --no-warn-script-location'
-      ${EndIf}
-      Pop $R0  ; Return value
-      DetailPrint "- $LEMONADE_SERVER_STRING install return code: $R0"
+    DetailPrint "- Installing $LEMONADE_SERVER_STRING..."
+    ${If} $HYBRID_SELECTED == "true"
+      ExecWait '"$INSTDIR\python\python.exe" -m pip install "$INSTDIR"[llm-oga-hybrid] --no-warn-script-location' $8
+    ${Else}
+      ExecWait '"$INSTDIR\python\python.exe" -m pip install "$INSTDIR"[llm] --no-warn-script-location' $8
+    ${EndIf}
+    DetailPrint "- $LEMONADE_SERVER_STRING install return code: $8"
 
-      ; Check if installation was successful (exit code should be 0)
-      StrCmp $R0 0 install_success install_failed
+    ; Check if installation was successful (exit code should be 0)
+    StrCmp $8 0 install_success install_failed
 
     install_success:
       DetailPrint "- $LEMONADE_SERVER_STRING installation successful"
@@ -233,7 +166,7 @@ Section "Install Ryzen AI Hybrid Execution" HybridSec
   ; Once we're done downloading and installing the archive the size comes out to about 370MB
   AddSize 388882
 
-  nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --ryzenai hybrid -y'
+  nsExec::ExecToLog '"$INSTDIR\python\Scripts\lemonade-install" --ryzenai hybrid -y' ; Executable path quoted so an $INSTDIR containing spaces still resolves
 
   Pop $R0  ; Return value
   DetailPrint "Hybrid execution mode install return code: $R0"
@@ -299,20 +232,19 @@ SubSection /e "Selected Models" ModelsSec
         ${GetParameters} $CMDLINE
         ${GetOptions} $CMDLINE "/Models=" $R0
         ${If} $R0 != ""
-            nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models $R0'
+            nsExec::ExecToLog '"$INSTDIR\python\Scripts\lemonade-install" --models $R0' ; Executable path quoted so an $INSTDIR containing spaces still resolves
         ${Else}
             ; Otherwise, only the default CPU model will be installed
-            nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models Qwen2.5-0.5B-Instruct-CPU'
+            nsExec::ExecToLog '"$INSTDIR\python\Scripts\lemonade-install" --models Qwen2.5-0.5B-Instruct-CPU' ; Quoted for the same reason
         ${EndIf}
     ${Else}
-        nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models $9'
+        nsExec::ExecToLog '"$INSTDIR\python\Scripts\lemonade-install" --models $9' ; Quoted for the same reason
     ${EndIf}
   SectionEnd
 
 SubSectionEnd
 
 Section "-Add Desktop Shortcut" ShortcutSec  
-  ; Create a desktop shortcut that passes the conda environment name as a parameter
   CreateShortcut "$DESKTOP\lemonade-server.lnk" "$INSTDIR\bin\lemonade-server.bat" "serve --keep-alive" "$INSTDIR\img\favicon.ico"
 
 SectionEnd
@@ -458,7 +390,7 @@ LangString MUI_TEXT_ABORT_SUBTITLE "${LANG_ENGLISH}" "Installation has been abor
 LangString MUI_BUTTONTEXT_FINISH "${LANG_ENGLISH}" "Finish"
 LangString MUI_TEXT_LICENSE_TITLE ${LANG_ENGLISH} "AMD License Agreement"
 LangString MUI_TEXT_LICENSE_SUBTITLE ${LANG_ENGLISH} "Please review the license terms before installing AMD Ryzen AI Hybrid Execution Mode."
-LangString DESC_SEC01 ${LANG_ENGLISH} "The minimum set of dependencies for a lemonade server that runs LLMs on CPU."
+LangString DESC_SEC01 ${LANG_ENGLISH} "The minimum set of dependencies for a lemonade server that runs LLMs on CPU (includes Python)."
 LangString DESC_HybridSec ${LANG_ENGLISH} "Add support for running LLMs on Ryzen AI hybrid execution mode. Only available on Ryzen AI 300-series processors."
 LangString DESC_ModelsSec ${LANG_ENGLISH} "Select which models to install"
 LangString DESC_Qwen05Sec ${LANG_ENGLISH} "Small CPU-only Qwen model"
@@ -485,7 +417,6 @@ LangString DESC_DeepSeekQwen7BSec ${LANG_ENGLISH} "7B parameter DeepSeek Qwen mo
 
 Function .onInit
   StrCpy $LEMONADE_SERVER_STRING "Lemonade Server"
-  StrCpy $LEMONADE_CONDA_ENV "lemon_env"
   StrCpy $HYBRID_SELECTED "true"
   
   ; Create a variable to store selected models
diff --git a/installer/lemonade-server.bat b/installer/lemonade-server.bat
index 06186805..80c3e502 100644
--- a/installer/lemonade-server.bat
+++ b/installer/lemonade-server.bat
@@ -1,6 +1,5 @@
 @echo off
 setlocal enabledelayedexpansion
-set CONDA_ENV=lemon_env
 
 REM --keep-alive is only used by the bash script to make sure that, if the server fails to open, we don't close the terminal right away.
 REM Check for --keep-alive argument and remove it from arguments passed to CLI
@@ -18,7 +17,7 @@ REM Change to parent directory where conda env and bin folders are located
 pushd "%~dp0.."
 
-REM Run the Python CLI script through conda, passing filtered arguments
+REM Run the Python CLI script from the bundled python\Scripts folder, passing filtered arguments
-call conda run --no-capture-output -p "%CD%\%CONDA_ENV%" lemonade-server-dev !ARGS!
+call "%CD%\python\Scripts\lemonade-server-dev" !ARGS!
 popd
 
 REM Error handling: Show message and pause if --keep-alive was specified
diff --git a/installer/run_server.bat b/installer/run_server.bat
deleted file mode 100644
index 6c6ffec1..00000000
--- a/installer/run_server.bat
+++ /dev/null
@@ -1,12 +0,0 @@
-@echo off
-set CONDA_ENV=%1
-if "%CONDA_ENV%"=="" set CONDA_ENV=lemon_env
-echo Starting Lemonade Server...
-call conda run --no-capture-output -p "%~dp0%CONDA_ENV%" lemonade serve
-if %ERRORLEVEL% neq 0 (
-  echo.
-  echo An error occurred while running Lemonade Server.
-  echo Please check the error message above.
-  echo.
-  pause
-)
diff --git a/src/lemonade/cache.py b/src/lemonade/cache.py
index b3e4cd84..c992865e 100644
--- a/src/lemonade/cache.py
+++ b/src/lemonade/cache.py
@@ -68,6 +68,7 @@ class Keys:
     DTYPE = "dtype"
     PROMPT = "prompt"
     PROMPT_TOKENS = "prompt_tokens"
+    PROMPT_TEMPLATE = "prompt_template"
     RESPONSE = "response"
     RESPONSE_TOKENS = "response_tokens"
     RESPONSE_LENGTHS_HISTOGRAM = "response_lengths_histogram"
diff --git a/src/lemonade/tools/ort_genai/oga.py b/src/lemonade/tools/ort_genai/oga.py
index b067cd2f..b92dbf2d 100644
--- a/src/lemonade/tools/ort_genai/oga.py
+++ b/src/lemonade/tools/ort_genai/oga.py
@@ -330,6 +330,14 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
             help="Download the model if needed, but don't load it",
         )
 
+        parser.add_argument(
+            "--trust-remote-code",
+            action="store_true",
+            help="Set this flag to use models whose code is on the Hugging Face hub rather "
+            "than natively in the OnnxRuntime Gen AI libraries.  Please review the model code "
+            "in advance as this is a security risk.",
+        )
+
         parser.add_argument(
             "--subfolder",
             default=None,
@@ -547,15 +555,28 @@ def _setup_npu_environment():
         return saved_state
 
     @staticmethod
-    def _load_model_and_setup_state(state, full_model_path, checkpoint):
+    def _load_model_and_setup_state(
+        state, full_model_path, checkpoint, trust_remote_code
+    ):
         """
         Loads the OGA model from local folder and then loads the tokenizer.
         """
         state.model = OrtGenaiModel(full_model_path)
 
-        hf_tokenizer = AutoTokenizer.from_pretrained(
-            full_model_path, local_files_only=True
-        )
+        try:
+            hf_tokenizer = AutoTokenizer.from_pretrained(
+                full_model_path,
+                local_files_only=True,
+                trust_remote_code=trust_remote_code,
+            )
+        except ValueError as e:  # Only errors that mention trust_remote_code are re-worded
+            if "trust_remote_code" in str(e):
+                raise ValueError(
+                    "This model requires you to execute code from the repo.  Please review it "
+                    "and if you trust it, then use the `--trust-remote-code` flag with oga-load."
+                ) from e  # Chain explicitly so the original error stays attached as __cause__
+            raise  # Any other ValueError propagates unchanged
+
         state.tokenizer = OrtGenaiTokenizer(
             state.model.model,
             hf_tokenizer,
@@ -582,6 +603,7 @@ def run(
         int4_block_size: int = None,
         force: bool = False,
         download_only: bool = False,
+        trust_remote_code=False,
         subfolder: str = None,
     ) -> State:
         state.device = device
@@ -671,7 +693,9 @@ def run(
                         "0" if "phi-" in checkpoint.lower() else "1"
                     )
 
-                self._load_model_and_setup_state(state, full_model_path, checkpoint)
+                self._load_model_and_setup_state(
+                    state, full_model_path, checkpoint, trust_remote_code
+                )
             finally:
                 self._cleanup_environment(saved_env_state)
 
diff --git a/src/lemonade/tools/prompt.py b/src/lemonade/tools/prompt.py
index 28a44284..79095ff8 100644
--- a/src/lemonade/tools/prompt.py
+++ b/src/lemonade/tools/prompt.py
@@ -2,6 +2,7 @@
 import os
 import matplotlib.pyplot as plt
 import turnkeyml.common.build as build
+import turnkeyml.common.printing as printing
 from turnkeyml.state import State
 from turnkeyml.tools import Tool
 from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
@@ -60,6 +61,7 @@ def __init__(self):
         self.status_stats = [
             Keys.PROMPT_TOKENS,
             Keys.PROMPT,
+            Keys.PROMPT_TEMPLATE,
             Keys.RESPONSE_TOKENS,
             Keys.RESPONSE,
             Keys.RESPONSE_LENGTHS_HISTOGRAM,
@@ -75,12 +77,19 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser.add_argument(
             "--prompt",
             "-p",
-            help="Input prompt to the LLM. Two formats are supported. "
-            "1) str: use a user-provided prompt string "
+            help="Input prompt to the LLM. Two formats are supported: "
+            "1) str: use a user-provided prompt string, and "
             "2) path/to/prompt.txt: load the prompt from a .txt file.",
             required=True,
         )
 
+        parser.add_argument(
+            "--template",
+            "-t",
+            action="store_true",
+            help="Insert the prompt into the model's chat template before processing.",
+        )
+
         parser.add_argument(
             "--max-new-tokens",
             "-m",
@@ -113,9 +122,6 @@ def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
         if parsed_args.prompt.endswith(".txt") and os.path.exists(parsed_args.prompt):
             with open(parsed_args.prompt, "r", encoding="utf-8") as f:
                 parsed_args.prompt = f.read()
-        else:
-            # No change to the prompt
-            pass
 
         return parsed_args
 
@@ -125,11 +131,28 @@ def run(
         prompt: str = "Hello",
         max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
         n_trials: int = DEFAULT_N_TRIALS,
+        template: bool = False,
     ) -> State:
 
         model: ModelAdapter = state.model
         tokenizer: TokenizerAdapter = state.tokenizer
 
+        # If template flag is set, then wrap prompt in template
+        if template:
+            # Embed prompt in model's chat template
+            if tokenizer.chat_template:
+                # Use the model's built-in chat template if available
+                messages_dict = [{"role": "user", "content": prompt}]
+                prompt = tokenizer.apply_chat_template(
+                    messages_dict, tokenize=False, add_generation_prompt=True
+                )
+                state.save_stat(Keys.PROMPT_TEMPLATE, "Model-specific")
+            else:
+                # Fallback to a standardized template
+                printing.log_info("No chat template found. Using default template.")
+                prompt = f"<|user|>\n{prompt} <|end|>\n<|assistant|>"
+                state.save_stat(Keys.PROMPT_TEMPLATE, "Default")
+
         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
         if isinstance(input_ids, (list, str)):
             # OGA models return a list of tokens
diff --git a/src/lemonade/tools/serve.py b/src/lemonade/tools/serve.py
index 2f904bfc..d561ed58 100644
--- a/src/lemonade/tools/serve.py
+++ b/src/lemonade/tools/serve.py
@@ -22,11 +22,8 @@
 from openai.types.chat.chat_completion_chunk import ChoiceDelta
 from openai.types.model import Model
 
-from turnkeyml.state import State
 from turnkeyml.tools.management_tools import ManagementTool
-from lemonade.tools.adapter import ModelAdapter
-from lemonade.tools.huggingface_load import HuggingfaceLoad
-from lemonade.cache import DEFAULT_CACHE_DIR
+import lemonade.api as lemonade_api
 from lemonade_install.install import ModelManager
 
 # Set to a high number to allow for interesting experiences in real apps
@@ -87,14 +84,16 @@ class LoadConfig(BaseModel):
     """
     Configuration for loading a language model.
 
-    Specifies the model checkpoint, cache directory, generation parameters,
-    and hardware configuration for model loading.
+    Specifies the model checkpoint, generation parameters,
+    and hardware/framework configuration (recipe) for model loading.
     """
 
     checkpoint: str
-    cache_dir: str = DEFAULT_CACHE_DIR
     max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
-    device: str = "cpu"
+    recipe: str = "hf-cpu"
+    # Indicates the maximum prompt length allowed for that specific
+    # checkpoint + recipe combination
+    max_length: int = None
 
 
 class CompletionRequest(BaseModel):
@@ -185,11 +184,11 @@ def __init__(self):
         # Flag that tells the LLM to stop generating text and end the response
         self.stop_event = Event()
 
-        self.llm_loaded = None
+        self.llm_loaded: LoadConfig = None
         self.tokenizer = None
 
-        # Placeholders for state and configs
-        self.state = None
+        # Placeholders for model and configs
+        self.model = None
         self.max_new_tokens = None
 
         # Initialize semaphore for tracking active generations
@@ -242,7 +241,9 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
 
     def run(
         self,
-        cache_dir: str = DEFAULT_CACHE_DIR,
+        # ManagementTool has a required cache_dir arg, but
+        # we always use the default cache directory
+        _=None,
         checkpoint: str = None,
         max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
         port: int = DEFAULT_PORT,
@@ -286,7 +287,6 @@ def trace(message, *args, **kwargs):
         # Only load the model when starting the server if checkpoint was provided
         if checkpoint:
             config = LoadConfig(
-                cache_dir=cache_dir,
                 checkpoint=checkpoint,
                 max_new_tokens=max_new_tokens,
             )
@@ -369,7 +369,7 @@ async def generate():
                     completion = Completion(
                         id="0",
                         choices=[choice],
-                        model=self.llm_loaded,
+                        model=self.llm_loaded.checkpoint,
                         object="text_completion",
                         created=int(time.time()),
                     )
@@ -406,7 +406,7 @@ async def generate():
             return Completion(
                 id="0",
                 choices=[choice],
-                model=self.llm_loaded,
+                model=self.llm_loaded.checkpoint,
                 object="text_completion",
                 created=int(time.time()),
             )
@@ -475,7 +475,7 @@ async def generate():
                         id="0",
                         object="chat.completion.chunk",
                         created=int(time.time()),
-                        model=self.llm_loaded,
+                        model=self.llm_loaded.checkpoint,
                         choices=[
                             Choice.model_construct(
                                 index=0,
@@ -539,7 +539,7 @@ async def generate():
             return ChatCompletion(
                 id="0",
                 choices=[choice],
-                model=self.llm_loaded,
+                model=self.llm_loaded.checkpoint,
                 object="chat.completion",
                 created=int(time.time()),
             )
@@ -549,8 +549,8 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No
         Core streaming completion logic, separated from response handling.
         Returns an async generator that yields tokens.
         """
-        model = self.state.model  # pylint: disable=no-member
-        tokenizer = self.state.tokenizer  # pylint: disable=no-member
+        model = self.model
+        tokenizer = self.tokenizer
 
         # Reset the early-exit flag before we start each generation
         self.stop_event.clear()
@@ -566,7 +566,7 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No
                 stop_sequences = stop[:4]  # Limit to 4 sequences as per spec
 
         # Set up the generation parameters
-        if isinstance(model, ModelAdapter) and model.type == "ort-genai":
+        if "oga-" in self.llm_loaded.recipe:
             from lemonade.tools.ort_genai.oga import OrtGenaiStreamer
 
             streamer = OrtGenaiStreamer(tokenizer)
@@ -579,6 +579,16 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No
             )
             self.input_tokens = len(input_ids[0])
 
+        if (
+            self.llm_loaded.max_length
+            and self.input_tokens > self.llm_loaded.max_length
+        ):
+            # This is the exact same exception message raised by OGA when max_length is exceeded
+            raise RuntimeError(
+                f"prompt tokens ({self.input_tokens}) cannot be greater "
+                f"than model context_length ({self.llm_loaded.max_length})"
+            )
+
         # Log the input tokens early to avoid this not showing due to potential crashes
         logging.debug(f"Input Tokens: {self.input_tokens}")
         logging.trace(f"Input Message: {message}")
@@ -710,11 +720,7 @@ async def health(self):
 
         return {
             "status": "ok",
-            "model_loaded": (
-                self.state.checkpoint  # pylint: disable=no-member
-                if self.state
-                else None
-            ),
+            "model_loaded": (self.llm_loaded.checkpoint if self.llm_loaded else None),
         }
 
     async def load_llm(self, config: LoadConfig):
@@ -725,7 +731,7 @@ async def load_llm(self, config: LoadConfig):
             for _ in range(self.max_concurrent_generations):
                 await self._generate_semaphore.acquire()
 
-            if config.checkpoint == self.llm_loaded:
+            if self.llm_loaded and config.checkpoint == self.llm_loaded.checkpoint:
                 return {
                     "status": "success",
                     "message": f"Model already loaded: {config.checkpoint}",
@@ -738,33 +744,12 @@ async def load_llm(self, config: LoadConfig):
             self.max_new_tokens = config.max_new_tokens
             logging.info(f"Loading llm: {config.checkpoint}")
             try:
-                state = State(
-                    cache_dir=config.cache_dir,
-                    build_name="main",
+                self.model, self.tokenizer = lemonade_api.from_pretrained(
+                    checkpoint=config.checkpoint, recipe=config.recipe
                 )
 
-                if config.device == "cpu":
-                    huggingface_loader = HuggingfaceLoad()
-                    self.state = huggingface_loader.run(
-                        state,
-                        input=config.checkpoint,
-                        device=config.device,
-                        dtype=torch.bfloat16,
-                    )
-                else:
-                    from lemonade.tools.ort_genai.oga import OgaLoad
-
-                    oga_loader = OgaLoad()
-                    self.state = oga_loader.run(
-                        state,
-                        input=config.checkpoint,
-                        device=config.device,
-                        dtype="int4",
-                        force=False,
-                    )
                 self.max_new_tokens = config.max_new_tokens
-                self.llm_loaded = config.checkpoint
-                self.tokenizer = self.state.tokenizer  # pylint: disable=no-member
+                self.llm_loaded = config
 
                 return {
                     "status": "success",
@@ -772,7 +757,8 @@ async def load_llm(self, config: LoadConfig):
                 }
             except Exception:  # pylint: disable=broad-exception-caught
                 self.llm_loaded = None
-                self.state = None
+                self.tokenizer = None
+                self.model = None
                 logging.exception(f"Tried to load LLM {config.checkpoint} and failed")
 
                 raise HTTPException(
@@ -797,7 +783,8 @@ async def unload_llm(self, require_lock: bool = True):
                     await self._generate_semaphore.acquire()
 
             self.llm_loaded = None
-            self.state = None
+            self.tokenizer = None
+            self.model = None
             return {"status": "success", "message": "Unloaded model"}
         except Exception as e:  # pylint: disable=broad-exception-caught
             return {
diff --git a/src/lemonade_install/install.py b/src/lemonade_install/install.py
index b1e01921..1983bf5a 100644
--- a/src/lemonade_install/install.py
+++ b/src/lemonade_install/install.py
@@ -107,7 +107,7 @@ def supported_cpu_models(self) -> dict:
         return {
             "Qwen2.5-0.5B-Instruct-CPU": {
                 "checkpoint": "Qwen/Qwen2.5-0.5B-Instruct",
-                "device": "cpu",
+                "recipe": "hf-cpu",
                 "reasoning": False,
             }
         }
@@ -121,33 +121,39 @@ def supported_hybrid_models(self) -> dict:
         return {
             "Llama-3.2-1B-Instruct-Hybrid": {
                 "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
-                "device": "hybrid",
+                "recipe": "oga-hybrid",
                 "reasoning": False,
+                "max_length": 3000,
             },
             "Llama-3.2-3B-Instruct-Hybrid": {
                 "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
-                "device": "hybrid",
+                "recipe": "oga-hybrid",
                 "reasoning": False,
+                "max_length": 2000,
             },
             "Phi-3-Mini-Instruct-Hybrid": {
                 "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
-                "device": "hybrid",
+                "recipe": "oga-hybrid",
                 "reasoning": False,
+                "max_length": 2000,
             },
             "Qwen-1.5-7B-Chat-Hybrid": {
                 "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid",
-                "device": "hybrid",
+                "recipe": "oga-hybrid",
                 "reasoning": False,
+                "max_length": 3000,
             },
             "DeepSeek-R1-Distill-Llama-8B-Hybrid": {
                 "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
-                "device": "hybrid",
+                "recipe": "oga-hybrid",
                 "reasoning": True,
+                "max_length": 2000,
             },
             "DeepSeek-R1-Distill-Qwen-7B-Hybrid": {
                 "checkpoint": "amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
-                "device": "hybrid",
+                "recipe": "oga-hybrid",
                 "reasoning": True,
+                "max_length": 2000,
             },
         }
 
diff --git a/src/lemonade_server/cli.py b/src/lemonade_server/cli.py
index 667d48b2..59e33cbe 100644
--- a/src/lemonade_server/cli.py
+++ b/src/lemonade_server/cli.py
@@ -18,7 +18,7 @@ def serve(args):
                 "Please stop the existing server before starting a new instance."
             ),
         )
-        return
+        sys.exit(1)
 
     # Otherwise, start the server
     print("Starting Lemonade Server...")
diff --git a/src/turnkeyml/version.py b/src/turnkeyml/version.py
index 86fe47d5..5d975072 100644
--- a/src/turnkeyml/version.py
+++ b/src/turnkeyml/version.py
@@ -1 +1 @@
-__version__ = "6.1.3"
+__version__ = "6.1.4"