1
1
import requests
2
2
from bs4 import BeautifulSoup
3
+ from memory import get_memory
3
4
from config import Config
4
5
from llm_utils import create_chat_completion
5
6
from urllib .parse import urlparse , urljoin
6
7
7
8
cfg = Config ()
9
+ memory = get_memory (cfg )
10
+
11
+ session = requests .Session ()
12
+ session .headers .update ({'User-Agent' : cfg .user_agent })
8
13
9
14
10
15
# Function to check if the URL is valid
@@ -27,7 +32,7 @@ def check_local_file_access(url):
27
32
return any (url .startswith (prefix ) for prefix in local_prefixes )
28
33
29
34
30
- def get_response (url , headers = cfg . user_agent_header , timeout = 10 ):
35
+ def get_response (url , timeout = 10 ):
31
36
try :
32
37
# Restrict access to local files
33
38
if check_local_file_access (url ):
@@ -39,7 +44,7 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10):
39
44
40
45
sanitized_url = sanitize_url (url )
41
46
42
- response = requests .get (sanitized_url , headers = headers , timeout = timeout )
47
+ response = session .get (sanitized_url , timeout = timeout )
43
48
44
49
# Check if the response contains an HTTP error
45
50
if response .status_code >= 400 :
@@ -106,7 +111,7 @@ def scrape_links(url):
106
111
return format_hyperlinks (hyperlinks )
107
112
108
113
109
- def split_text (text , max_length = 8192 ):
114
+ def split_text (text , max_length = cfg . browse_chunk_max_length ):
110
115
"""Split text into chunks of a maximum length"""
111
116
paragraphs = text .split ("\n " )
112
117
current_length = 0
@@ -133,7 +138,7 @@ def create_message(chunk, question):
133
138
}
134
139
135
140
136
- def summarize_text (text , question ):
141
+ def summarize_text (url , text , question ):
137
142
"""Summarize text using the LLM model"""
138
143
if not text :
139
144
return "Error: No text to summarize"
@@ -145,15 +150,28 @@ def summarize_text(text, question):
145
150
chunks = list (split_text (text ))
146
151
147
152
for i , chunk in enumerate (chunks ):
153
+ print (f"Adding chunk { i + 1 } / { len (chunks )} to memory" )
154
+
155
+ memory_to_add = f"Source: { url } \n " \
156
+ f"Raw content part#{ i + 1 } : { chunk } "
157
+
158
+ memory .add (memory_to_add )
159
+
148
160
print (f"Summarizing chunk { i + 1 } / { len (chunks )} " )
149
161
messages = [create_message (chunk , question )]
150
162
151
163
summary = create_chat_completion (
152
164
model = cfg .fast_llm_model ,
153
165
messages = messages ,
154
- max_tokens = 300 ,
166
+ max_tokens = cfg . browse_summary_max_token ,
155
167
)
156
168
summaries .append (summary )
169
+ print (f"Added chunk { i + 1 } summary to memory" )
170
+
171
+ memory_to_add = f"Source: { url } \n " \
172
+ f"Content summary part#{ i + 1 } : { summary } "
173
+
174
+ memory .add (memory_to_add )
157
175
158
176
print (f"Summarized { len (chunks )} chunks." )
159
177
@@ -163,7 +181,7 @@ def summarize_text(text, question):
163
181
final_summary = create_chat_completion (
164
182
model = cfg .fast_llm_model ,
165
183
messages = messages ,
166
- max_tokens = 300 ,
184
+ max_tokens = cfg . browse_summary_max_token ,
167
185
)
168
186
169
187
return final_summary
0 commit comments