@@ -199,6 +199,88 @@ def find_links_in_text(text):
199199 return url_pattern .findall (text )
200200
201201
def extract_reply_content_from_formatted_body(formatted_body):
    """Return the reply's own text from a Matrix ``formatted_body``.

    Matrix replies embed the quoted original message inside
    ``<mx-reply>...</mx-reply>``. That section is removed, the remaining HTML
    is flattened to plain text, and link targets found in ``href`` attributes
    are appended so they stay discoverable once the tags are gone.

    Args:
        formatted_body: HTML string of the message, or a falsy value.

    Returns:
        ``None`` when ``formatted_body`` is falsy. Otherwise the stripped
        plain text (possibly empty), followed by every ``href`` URL that does
        not already appear in that text as a whitespace-delimited token.
    """
    # Local import keeps this function self-contained with respect to the
    # file's import header.
    import html

    if not formatted_body:
        return None

    # Drop the quoted section; DOTALL lets the match span several lines.
    content = re.sub(
        r'<mx-reply>.*?</mx-reply>',
        '',
        formatted_body,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Collect link targets before the tags disappear. The value may contain
    # the *other* quote character, and attribute values are entity-encoded
    # ("&amp;" for "&"), so decode them to get the real URL.
    href_urls = [
        html.unescape(match.group(2))
        for match in re.finditer(
            r'href=(["\'])(.*?)\1', content, flags=re.IGNORECASE
        )
    ]

    # Line-breaking tags become newlines so that a URL is not fused with the
    # text of the following line/paragraph when the markup is removed.
    content = re.sub(
        r'<\s*/?\s*(?:br|p|div|li|ul|ol|pre|blockquote|h[1-6]|tr|table)\b[^>]*>',
        '\n',
        content,
        flags=re.IGNORECASE,
    )

    # Remaining (inline) tags are dropped while their text content is kept.
    content = re.sub(r'<[^>]+>', '', content)

    # Decode entities last so an encoded "&lt;" can never be taken for markup.
    text = html.unescape(content).strip()

    # Append only the link targets that the visible text does not already
    # contain, so a link whose label is its own URL is not reported twice.
    seen = set(text.split())
    extra_urls = []
    for url in href_urls:
        if url and url not in seen:
            seen.add(url)
            extra_urls.append(url)

    return ' '.join([text, *extra_urls]).strip()
229+
230+
def has_reply_relationship(event_source):
    """Return True when the raw event carries an ``m.in_reply_to`` relation.

    The lookup path is ``event_source['content']['m.relates_to']``. Every
    level is type-checked because the data is raw event JSON: a missing,
    ``null`` or non-mapping value at any level means "not a reply" rather
    than an exception, and a string value is never substring-matched.

    Args:
        event_source: The raw event as a dict; any other type yields False.

    Returns:
        True if ``m.relates_to`` is a dict containing the key
        ``m.in_reply_to``, otherwise False.
    """
    if not isinstance(event_source, dict):
        return False

    content = event_source.get('content')
    if not isinstance(content, dict):
        return False

    relates_to = content.get('m.relates_to')
    if not isinstance(relates_to, dict):
        return False

    return 'm.in_reply_to' in relates_to
238+
239+
def get_content_for_link_processing(event):
    """Choose the text in which links should be searched for ``event``.

    Non-replies are searched as-is via ``event.body``. For replies (as
    reported by ``has_reply_relationship(event.source)``) the quoted part of
    the message is left out: the HTML ``formatted_body`` is preferred when it
    is present and yields non-empty text, otherwise the plain body is
    filtered with ``find_links_excluding_quotes_fallback``.
    """
    # Ordinary message: nothing to filter.
    if not has_reply_relationship(event.source):
        return event.body

    # Reply: prefer the structured HTML body when the event carries one.
    html_body = getattr(event, 'formatted_body', None)
    if html_body:
        reply_text = extract_reply_content_from_formatted_body(html_body)
        if reply_text:
            return reply_text

    # No usable formatted_body: drop the quoted lines from the plain body.
    return find_links_excluding_quotes_fallback(event.body)
259+
260+
def find_links_excluding_quotes_fallback(text):
    """Return ``text`` with quoted lines removed (plain-body reply fallback).

    Used when no ``formatted_body`` is available. Matrix replies often repeat
    the original message in the plain body using the fallback format::

        > <@user:example.com> Original message with URLs

    Every line whose first non-whitespace character is ``>`` is dropped so
    that URLs from the quoted message are not picked up again. Despite its
    name, the function returns the filtered text, not a list of links.

    Args:
        text: Plain message body; a falsy value (``None`` or ``''``) is
            treated as an empty body.

    Returns:
        The remaining lines joined with newlines, or ``''`` for a falsy input.
    """
    if not text:
        return ''

    # Split on a bare newline and keep only lines that are not quotes;
    # lstrip() makes indented quote markers count as quotes too.
    return '\n'.join(
        line
        for line in text.split('\n')
        if not line.lstrip().startswith('>')
    )
282+
283+
202284async def main ():
203285 load_dotenv ()
204286 load_config_data ()
@@ -286,7 +368,9 @@ async def message_handler_callback(room: MatrixRoom, event: RoomMessageText):
286368 logger .debug ("Message is from self, ignoring." )
287369 return
288370
289- found_links = find_links_in_text (event .body )
371+ # Get the appropriate content for link processing
372+ content_to_process = get_content_for_link_processing (event )
373+ found_links = find_links_in_text (content_to_process )
290374 if not found_links :
291375 logger .debug ("No links found in message." )
292376 return
0 commit comments