Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
OPENAI_MODEL_ID=gpt-4.1-nano
OPENAI_API_KEY=<str>
OLLAMA_MODEL_ID=llama3.2:3b # replace with your model
Copy link

Copilot AI Nov 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space after the comment delimiter. Should be llama3.2:3b # replace with your model for consistency with other comment formatting.

Copilot uses AI. Check for mistakes.
# OLLAMA_API_URL=http://localhost:11434

# Huggingface API Config
HUGGINGFACE_ACCESS_TOKEN=<str>
Expand All @@ -18,6 +19,11 @@ DATABASE_NAME=ghost_db
QDRANT_DATABASE_HOST=localhost
QDRANT_DATABASE_PORT=6333

# Qdrant Cloud Config
USE_QDRANT_CLOUD=true
QDRANT_CLOUD_URL=your_qdrant_cloud_url
QDRANT_APIKEY=your_qdrant_api_key

# uvicorn Config
UVICORN_HOST=0.0.0.0
UVICORN_PORT=8000
Expand Down
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -228,4 +228,10 @@ marimo/_lsp/
__marimo__/

# Local
backup
assets/
backup/
configs/feature_engineering.yaml
configs/digital_data_etl_huyen-chip.yaml
configs/digital_data_etl_jay_alammar.yaml
output/
tests/payloads/payload.json
10 changes: 10 additions & 0 deletions configs/digital_data_etl_author_name.yaml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
parameters:
user_full_name: John Doe # [First Name(s)] [Last Name]
links:
# Blog Posts
- https://johndoe.blog/post1
- https://johndoe.blog/post2
# GitHub Repositories
Comment on lines +4 to +7
Copy link

Copilot AI Nov 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inconsistent comment formatting: "blog Posts" should be "Blog Posts" or "blog posts" to match the capitalization pattern used for "github Repositories" below (which should likely be "GitHub Repositories").

Suggested change
# blog Posts
- https://johndoe.blog/post1
- https://johndoe.blog/post2
# github Repositories
# Blog Posts
- https://johndoe.blog/post1
- https://johndoe.blog/post2
# GitHub Repositories

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Nov 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inconsistent capitalization: "github Repositories" should be "GitHub Repositories" to properly capitalize the brand name and maintain consistency.

Suggested change
# github Repositories
# GitHub Repositories

Copilot uses AI. Check for mistakes.
- https://github.com/johndoe/repo1
- https://github.com/johndoe/repo2
- https://github.com/johndoe/awesome-project
4 changes: 4 additions & 0 deletions configs/feature_engineering.yaml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
parameters:
author_full_names:
- John Doe
- Jane Smith
1 change: 1 addition & 0 deletions core/application/rag/query_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def generate(self, query: Query, expand_to_n: int) -> list[Query]:
prompt = query_expansion_template.create_template(expand_to_n - 1) # excluding the original prompt
# todo: handle token limit exceeded error
# todo: localize the model initialization
# use either Ollama or OpenAI model
model = ChatOllama(model=settings.OLLAMA_MODEL_ID, temperature=0)
# model = ChatOpenAI(model=settings.OPENAI_MODEL_ID,
# api_key=settings.OPENAI_API_KEY,
Expand Down
3 changes: 2 additions & 1 deletion core/application/rag/self_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def generate(self, query: Query) -> Query:
return query

prompt = SelfQueryTemplate().create_template()
# use either Ollama or OpenAI model
model = ChatOllama(model=settings.OLLAMA_MODEL_ID, temperature=0)
# model = ChatOpenAI(model=settings.OPENAI_MODEL_ID,
# api_key=settings.OPENAI_API_KEY,
Expand All @@ -40,7 +41,7 @@ def generate(self, query: Query) -> Query:

# python -m core.application.rag.self_query
if __name__ == "__main__":
query = Query.from_str("I am Huyen Chip. Write an article about the best types of advanced RAG methods.")
query = Query.from_str("I am <Author Name>. Write an article about the best types of advanced RAG methods.")
self_query = SelfQuery()
query = self_query.generate(query)
logger.info(f"Extracted author_id: {query.author_id}")
Expand Down
4 changes: 2 additions & 2 deletions core/infrastructure/inference_pipeline_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from core.application.utils import misc
from core.domain.embedded_chunks import EmbeddedChunk
from core.infrastructure.opik_utils import configure_opik
from core.model.inference import InferenceExecutor, LLMInferenceTransformersLocal
from core.model.inference import InferenceExecutor, LLMInferenceTransformers

configure_opik()

Expand All @@ -25,7 +25,7 @@ class QueryResponse(BaseModel):

@opik.track
def call_llm_service(query: str, context: str | None) -> str:
llm = LLMInferenceTransformersLocal(
llm = LLMInferenceTransformers(
model_id=settings.HUGGINGFACE_INFERENCE_MODEL_ID,
)
answer = InferenceExecutor(llm, query, context).execute()
Expand Down
24 changes: 22 additions & 2 deletions core/model/finetuning/llm_ghostwriter_finetune_dpo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -563,10 +563,19 @@
]
},
"id": "MTQnOypRl4pS",
"jupyter": {
"is_executing": true
},
"outputId": "1683fb7a-37ce-41bc-ec6f-84045308a1c0"
},
"outputs": [],
"source": [
"# use low learning rate for DPO training\n",
"# otherwise it might cause corrupted responses during generation.\n",
"# larger learning rates and longer update steps are more likely to trigger this issue.\n",
"# https://github.com/huggingface/trl/issues/1025#issuecomment-1826268148\n",
"# https://kyunghyuncho.me/a-proper-preference-optimization-loss-and-its-gradient/\n",
"\n",
"trainer = DPOTrainer(\n",
" model=model,\n",
" ref_model=None,\n",
Expand All @@ -577,7 +586,7 @@
" max_length=max_seq_length // 2,\n",
" max_prompt_length=max_seq_length // 2,\n",
" args=DPOConfig(\n",
" learning_rate=2e-6,\n",
" learning_rate=2e-6, # https://github.com/huggingface/trl/issues/1025#issuecomment-1826268148\n",
" lr_scheduler_type=\"linear\",\n",
" per_device_train_batch_size=2,\n",
" per_device_eval_batch_size=2,\n",
Expand Down Expand Up @@ -642,7 +651,18 @@
"inputs = tokenizer([message], return_tensors=\"pt\").to(\"cuda\")\n",
"\n",
"text_streamer = TextStreamer(tokenizer)\n",
"_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256, use_cache=True)"
"_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256, use_cache=True)\n",
"\n",
"# \"\"\"\n",
"# <|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
"#\n",
"# ### Instruction:\n",
"# Write a paragraph to introduce supervised fine-tuning.\n",
"#\n",
"# ### Response:\n",
"# Supervised fine-tuning is a technique used in natural language processing and machine learning to improve the performance of a model by training it on a large dataset of labeled examples. This process involves feeding the model examples of correct and incorrect responses, and using this information to adjust the model's parameters to better predict the correct response. Supervised fine-tuning is commonly used in tasks such as text classification, sentiment analysis, and machine translation, where the goal is to train the model to accurately classify or translate between languages. By fine-tuning the model on a labeled dataset, the model learns to better recognize and respond to the specific prompts and expectations of the task at hand.<|end_of_text|>\n",
"#\n",
"# \"\"\""
]
},
{
Expand Down
14 changes: 13 additions & 1 deletion core/model/finetuning/llm_ghostwriter_finetune_sft.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,19 @@
"inputs = tokenizer([message], return_tensors=\"pt\").to(\"cuda\")\n",
"\n",
"text_streamer = TextStreamer(tokenizer)\n",
"_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256, use_cache=True)"
"_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256, use_cache=True)\n",
"\n",
"\n",
"# \"\"\"\n",
"# <|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
"#\n",
"# ### Instruction:\n",
"# Write a paragraph to introduce supervised fine-tuning.\n",
"#\n",
"# ### Response:\n",
"# Supervised fine-tuning is a technique used to improve a language model by teaching it to predict the next word based on examples provided by a labeled dataset. This process involves training the model to understand the relationships between words and how to predict the next one given a set of previous tokens. Supervised fine-tuning is commonly used with language models like GPT-3 and is considered a state-of-the-art approach for text generation.<|end_of_text|>\n",
"#\n",
"# \"\"\""
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions core/model/inference/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .inference import LLMInferenceTransformersLocal
from .inference import LLMInferenceTransformers
from .run import InferenceExecutor

__all__ = ["InferenceExecutor", "LLMInferenceTransformersLocal"]
__all__ = ["InferenceExecutor", "LLMInferenceTransformers"]
3 changes: 1 addition & 2 deletions core/model/inference/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa
return False


class LLMInferenceTransformersLocal(Inference):
class LLMInferenceTransformers(Inference):
"""Huggingface Transformers local model inference"""

_shared_model = None
Expand All @@ -53,7 +53,6 @@ def __init__(

self.model_id = model_id or settings.HUGGINGFACE_INFERENCE_MODEL_ID

# --- Device & dtype (safer defaults on MPS) ---
self._device = device or settings.HF_MODEL_DEVICE
if torch_dtype is None:
if self._device == "mps":
Expand Down
4 changes: 2 additions & 2 deletions core/model/inference/test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from loguru import logger

from core.model.inference.inference import LLMInferenceTransformersLocal
from core.model.inference.inference import LLMInferenceTransformers
from core.model.inference.run import InferenceExecutor
from core.settings import settings

Expand All @@ -19,7 +19,7 @@
### Response:
"""
logger.info(f"Running inference for text: '{text}'")
llm = LLMInferenceTransformersLocal(
llm = LLMInferenceTransformers(
model_id=settings.HUGGINGFACE_INFERENCE_MODEL_ID,
)
answer = InferenceExecutor(llm, text).execute()
Expand Down
Loading
Loading