# pip install -q -U google-genai  # required to use Gemini as a client
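"""
LightRAG example that combines:
  * Gemini (via the google-genai client) as the LLM,
  * a local SentenceTransformer model ("all-MiniLM-L6-v2") for embeddings,
  * a Gemma SentencePiece tokenizer (downloaded from the gemma_pytorch repo) for token counting.

The script indexes ./story.txt into the ./dickens working directory and runs a hybrid query.
"""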

import os
from typing import Optional
import dataclasses
from pathlib import Path
import hashlib
import numpy as np
from google import genai
from google.genai import types
from dotenv import load_dotenv
from lightrag.utils import EmbeddingFunc, Tokenizer
from lightrag import LightRAG, QueryParam
from sentence_transformers import SentenceTransformer
from lightrag.kg.shared_storage import initialize_pipeline_status
import sentencepiece as spm
import requests

import asyncio
import nest_asyncio

# Apply nest_asyncio to solve event loop issues
nest_asyncio.apply()

load_dotenv()
gemini_api_key = os.getenv("GEMINI_API_KEY")
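# GEMINI_API_KEY is expected in the environment or in a local .env file (loaded above).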

WORKING_DIR = "./dickens"

if os.path.exists(WORKING_DIR):
    import shutil

    shutil.rmtree(WORKING_DIR)

os.mkdir(WORKING_DIR)


class GemmaTokenizer(Tokenizer):
    # adapted from google-cloud-aiplatform[tokenization]

    @dataclasses.dataclass(frozen=True)
    class _TokenizerConfig:
        tokenizer_model_url: str
        tokenizer_model_hash: str

    _TOKENIZERS = {
        "google/gemma2": _TokenizerConfig(
            tokenizer_model_url="https://raw.githubusercontent.com/google/gemma_pytorch/33b652c465537c6158f9a472ea5700e5e770ad3f/tokenizer/tokenizer.model",
            tokenizer_model_hash="61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2",
        ),
        "google/gemma3": _TokenizerConfig(
            tokenizer_model_url="https://raw.githubusercontent.com/google/gemma_pytorch/cb7c0152a369e43908e769eb09e1ce6043afe084/tokenizer/gemma3_cleaned_262144_v2.spiece.model",
            tokenizer_model_hash="1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c",
        ),
    }

    def __init__(
        self, model_name: str = "gemini-2.0-flash", tokenizer_dir: Optional[str] = None
    ):
        # https://github.com/google/gemma_pytorch/tree/main/tokenizer
        if "1.5" in model_name or "1.0" in model_name:
            # Up to Gemini 1.5, the Gemma 2 tokenizer is a comparable local tokenizer.
            # https://github.com/googleapis/python-aiplatform/blob/main/vertexai/tokenization/_tokenizer_loading.py
            tokenizer_name = "google/gemma2"
        else:
            # For Gemini 2.0 and later, the Gemma 3 tokenizer is used.
            tokenizer_name = "google/gemma3"

        file_url = self._TOKENIZERS[tokenizer_name].tokenizer_model_url
        tokenizer_model_name = file_url.rsplit("/", 1)[1]
        expected_hash = self._TOKENIZERS[tokenizer_name].tokenizer_model_hash

        # Try the on-disk cache first (if a cache directory was given), then fall
        # back to downloading the SentencePiece model and caching it.
        file_path = None
        model_data = None
        if tokenizer_dir is not None:
            file_path = Path(tokenizer_dir) / tokenizer_model_name
            model_data = self._maybe_load_from_cache(
                file_path=file_path, expected_hash=expected_hash
            )
        if not model_data:
            model_data = self._load_from_url(
                file_url=file_url, expected_hash=expected_hash
            )
            if file_path is not None:
                self.save_tokenizer_to_cache(
                    cache_path=file_path, model_data=model_data
                )

        tokenizer = spm.SentencePieceProcessor()
        tokenizer.LoadFromSerializedProto(model_data)
        super().__init__(model_name=model_name, tokenizer=tokenizer)

    def _is_valid_model(self, model_data: bytes, expected_hash: str) -> bool:
        """Returns True if the content is valid, judged by its SHA-256 hash."""
        return hashlib.sha256(model_data).hexdigest() == expected_hash

    def _maybe_load_from_cache(
        self, file_path: Path, expected_hash: str
    ) -> Optional[bytes]:
        """Loads the model data from the cache path, or None if missing or corrupted."""
        if not file_path.is_file():
            return None
        with open(file_path, "rb") as f:
            content = f.read()
        if self._is_valid_model(model_data=content, expected_hash=expected_hash):
            return content

        # Cached file is corrupted; remove it so it can be re-downloaded.
        self._maybe_remove_file(file_path)
        return None

    def _load_from_url(self, file_url: str, expected_hash: str) -> bytes:
        """Loads model bytes from the given file URL and verifies their hash."""
        resp = requests.get(file_url, timeout=60)
        resp.raise_for_status()
        content = resp.content

        if not self._is_valid_model(model_data=content, expected_hash=expected_hash):
            actual_hash = hashlib.sha256(content).hexdigest()
            raise ValueError(
                "Downloaded model file is corrupted."
                f" Expected hash {expected_hash}. Got file hash {actual_hash}."
            )
        return content

    @staticmethod
    def save_tokenizer_to_cache(cache_path: Path, model_data: bytes) -> None:
        """Saves the model data to the cache path."""
        try:
            if not cache_path.is_file():
                cache_dir = cache_path.parent
                cache_dir.mkdir(parents=True, exist_ok=True)
                with open(cache_path, "wb") as f:
                    f.write(model_data)
        except OSError:
            # Don't raise if we cannot write the file.
            pass

    @staticmethod
    def _maybe_remove_file(file_path: Path) -> None:
        """Removes the file if it exists."""
        if not file_path.is_file():
            return
        try:
            file_path.unlink()
        except OSError:
            # Don't raise if we cannot remove the file.
            pass

    # def encode(self, content: str) -> list[int]:
    #     return self.tokenizer.encode(content)

    # def decode(self, tokens: list[int]) -> str:
    #     return self.tokenizer.decode(tokens)
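
# Example usage of the tokenizer above (a sketch; it assumes the base lightrag
# Tokenizer exposes encode()/decode() over the wrapped SentencePiece processor,
# which is why the overrides above are left commented out):
#   tok = GemmaTokenizer(tokenizer_dir="./dickens/vertexai_tokenizer_model")
#   num_tokens = len(tok.encode("A Christmas Carol"))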


async def llm_model_func(
    prompt, system_prompt=None, history_messages=None, keyword_extraction=False, **kwargs
) -> str:
    # 1. Initialize the GenAI client with your Gemini API key
    client = genai.Client(api_key=gemini_api_key)

    # 2. Combine prompts: system prompt, history, and user prompt
    if history_messages is None:
        history_messages = []

    combined_prompt = ""
    if system_prompt:
        combined_prompt += f"{system_prompt}\n"

    for msg in history_messages:
        # Each msg is expected to be a dict: {"role": "...", "content": "..."}
        combined_prompt += f"{msg['role']}: {msg['content']}\n"

    # Finally, add the new user prompt
    combined_prompt += f"user: {prompt}"

    # 3. Call the Gemini model
    response = client.models.generate_content(
        model="gemini-1.5-flash",
        contents=[combined_prompt],
        config=types.GenerateContentConfig(max_output_tokens=500, temperature=0.1),
    )

    # 4. Return the response text
    return response.text


async def embedding_func(texts: list[str]) -> np.ndarray:
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings
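# Note: the SentenceTransformer model above is re-loaded on every call. A simple
# optimization (a sketch, not required for correctness) is to load it once at
# module scope and reuse it:
#   _ST_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
#   async def embedding_func(texts): return _ST_MODEL.encode(texts, convert_to_numpy=True)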


async def initialize_rag():
    rag = LightRAG(
        working_dir=WORKING_DIR,
        # tiktoken_model_name="gpt-4o-mini",
        tokenizer=GemmaTokenizer(
            tokenizer_dir=(Path(WORKING_DIR) / "vertexai_tokenizer_model"),
            model_name="gemini-2.0-flash",
        ),
        llm_model_func=llm_model_func,
        embedding_func=EmbeddingFunc(
            embedding_dim=384,  # all-MiniLM-L6-v2 produces 384-dimensional vectors
            max_token_size=8192,
            func=embedding_func,
        ),
    )

    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


def main():
    # Initialize RAG instance
    rag = asyncio.run(initialize_rag())
    file_path = "story.txt"
    with open(file_path, "r") as file:
        text = file.read()

    rag.insert(text)

    response = rag.query(
        query="What is the main theme of the story?",
        param=QueryParam(mode="hybrid", top_k=5, response_type="single line"),
    )

    print(response)


if __name__ == "__main__":
    main()