# RAGGA/config.yaml at main · zeyus-research/RAGGA · GitHub
# Embedding encoder settings: which model to load, where embeddings are
# cached, and the keyword arguments passed for model loading and encoding.
encoder:
  model_path: "sentence-transformers/all-MiniLM-l6-v2"
  cache_embeddings: true
  # If true, embeddings are cached in cache_dir; if false, they are cached
  # in memory.
  persistent_cache: true
  cache_dir: ".cache"

  model_kwargs:
    device: "cpu"  # "cuda" for NVIDIA GPU or "cpu"

  encode_kwargs:
    normalize_embeddings: false

# Retriever settings: how documents are split and how many are fetched.
retriever:
  splitting:
    # NOTE(review): the unit (characters vs. tokens) is not stated in this
    # file — confirm against the splitter that consumes these values.
    chunk_size: 256
    chunk_overlap: 64
  # NOTE(review): num_docs * chunk_size = 2048, the same number as
  # generator.context_length. If chunk_size is measured in tokens, the
  # retrieved context alone could fill the model's window — confirm.
  num_docs: 8
  # Same value as encoder.cache_dir.
  cache_dir: ".cache"

# Generator settings: the local LLM file and its sampling/runtime options.
generator:
  llm_path: "models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
  context_length: 2048
  temperature: 0.8
  max_tokens: 256
  # Number of layers to offload to the GPU (if available); null for all
  # layers.
  gpu_layers: 0
  model_kwargs: {}
  search_kwargs: {}
  similarity_threshold: 0.75
  # Not in use. Eventually the context can be compressed to remove
  # redundant information.
  compress: false
  # true if the model is a llama model.
  # NOTE(review): llm_path above names a TinyLlama file — confirm whether
  # this flag should be true for it.
  llama: false
  # Automatically clear the model stdout and stderr buffers; false to
  # manually control flushing.
  autoflush: true

# Dataset settings: where the source documents live and how they are loaded.
dataset:
  path: "./data"
  recursive: true
  show_progress: true  # show progress bar when loading dataset
  use_multithreading: true
  loader_kwargs:
    mode: "single"  # either "single" or "elements"
    unstructured_kwargs:
      include_metadata: true
      # language can be "auto", or a list of known languages
      languages: ["en"]
      chunking_strategy: "by_title"  # "by_title" or null (no chunking)
# Prompt template pieces.
# NOTE(review): how these strings are assembled is not visible in this
# file — confirm in the code that reads them.
prompt:
  user_name: "User"
  AI_name: "Assistant"
  instruct_user: "Instruct" # usually "System", or "Instruct"
  # The trailing space appears intentional: pre_context reads as the
  # continuation of this sentence (presumably concatenated — confirm).
  instructions: "You are a friendly, helpful AI assistant. Succinctly and truthfully answer the user's question "
  pre_context: "using the following documents"
  post_context: ""
  # {question} is presumably a placeholder filled with the user's
  # question at runtime — TODO confirm.
  user_query: "{question}"