Skip to content

Commit 2a39025

Browse files
authored
Merge pull request #6 from aosus/copilot/fix-5
Improve Matrix reply handling using structured data instead of markdown parsing
2 parents b77a512 + 0deffa9 commit 2a39025

3 files changed

Lines changed: 87 additions & 1 deletion

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
venv/
22
.vscode/
33
.env
4+
__pycache__/
5+
*.pyc
20.6 KB
Binary file not shown.

matrix_bot.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,88 @@ def find_links_in_text(text):
199199
return url_pattern.findall(text)
200200

201201

202+
def extract_reply_content_from_formatted_body(formatted_body):
    """Return the reply's own text from a Matrix ``formatted_body``.

    Matrix replies embed the quoted original message inside an
    ``<mx-reply>...</mx-reply>`` element. That element is removed, the
    remaining HTML tags are stripped, HTML entities are decoded, and any
    ``href`` targets found in the remaining markup are appended to the text
    (space separated) so link detection still sees them.

    Args:
        formatted_body: HTML string of the message, or a falsy value.

    Returns:
        The cleaned text (possibly an empty string), or ``None`` when
        ``formatted_body`` is falsy.
    """
    # Function-scope import keeps this block independent of file-level imports.
    import html

    if not formatted_body:
        return None

    # Drop the quoted section. DOTALL lets the non-greedy match span newlines.
    clean_content = re.sub(
        r'<mx-reply>.*?</mx-reply>',
        '',
        formatted_body,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Collect link targets before the tags (and their attributes) are removed.
    # Attribute values are HTML-escaped (e.g. "&amp;" in query strings), so
    # decode them; otherwise the extracted URL would be wrong.
    href_matches = re.findall(
        r'href=(["\'])([^"\']*)\1', clean_content, re.IGNORECASE
    )
    url_list = [html.unescape(url) for _quote, url in href_matches]

    # Strip tags first, then decode entities: decoding first would turn an
    # escaped "&lt;b&gt;" into something the tag regex would delete.
    clean_content = html.unescape(re.sub(r'<[^>]+>', '', clean_content)).strip()

    if url_list:
        clean_content = clean_content + ' ' + ' '.join(url_list)

    return clean_content.strip()
229+
230+
231+
def has_reply_relationship(event_source):
    """Return True if the raw event dict carries an ``m.in_reply_to`` relation.

    The lookup path is ``event_source['content']['m.relates_to']``. Every
    level is type-checked so that a malformed event (missing keys, ``null``
    values, or non-dict values) yields ``False`` instead of raising.

    Args:
        event_source: The raw event payload; expected to be a dict.

    Returns:
        ``True`` when ``m.in_reply_to`` is a key of the ``m.relates_to``
        mapping, ``False`` otherwise.
    """
    if not isinstance(event_source, dict):
        return False

    content = event_source.get('content')
    if not isinstance(content, dict):
        return False

    relates_to = content.get('m.relates_to')
    # Must be a mapping: a None value would raise on ``in`` and a string
    # value would turn the check into a substring match.
    if not isinstance(relates_to, dict):
        return False

    return 'm.in_reply_to' in relates_to
238+
239+
240+
def get_content_for_link_processing(event):
    """Choose the text that should be scanned for links in *event*.

    Non-reply events yield ``event.body`` unchanged. For replies (detected
    via ``has_reply_relationship(event.source)``), the text extracted from
    ``event.formatted_body`` is returned when that attribute exists and the
    extraction produces a non-empty result; otherwise ``event.body`` is
    passed through ``find_links_excluding_quotes_fallback``.
    """
    # Regular message: nothing quoted to filter out.
    if not has_reply_relationship(event.source):
        return event.body

    # Reply: prefer the structured HTML body when the event carries one.
    html_body = getattr(event, 'formatted_body', None)
    if html_body:
        extracted = extract_reply_content_from_formatted_body(html_body)
        if extracted:
            return extracted

    # No usable formatted_body: drop '>'-quoted lines from the plain body.
    return find_links_excluding_quotes_fallback(event.body)
259+
260+
261+
def find_links_excluding_quotes_fallback(text):
    """Return *text* with quoted lines removed.

    Fallback for replies that have no formatted_body. The plain-text body of
    a Matrix reply can carry the quoted original in lines such as:

        > <@user:example.com> Original message with URLs

    Any line whose first non-whitespace character is '>' is dropped; the
    remaining lines are re-joined with newlines, so URLs that appear only in
    the quoted part are no longer present in the result.
    """
    kept = [
        row
        for row in text.split('\n')
        # lstrip() so that indented quote markers are treated as quotes too.
        if not row.lstrip().startswith('>')
    ]
    return '\n'.join(kept)
282+
283+
202284
async def main():
203285
load_dotenv()
204286
load_config_data()
@@ -286,7 +368,9 @@ async def message_handler_callback(room: MatrixRoom, event: RoomMessageText):
286368
logger.debug("Message is from self, ignoring.")
287369
return
288370

289-
found_links = find_links_in_text(event.body)
371+
# Get the appropriate content for link processing
372+
content_to_process = get_content_for_link_processing(event)
373+
found_links = find_links_in_text(content_to_process)
290374
if not found_links:
291375
logger.debug("No links found in message.")
292376
return

0 commit comments

Comments
 (0)