From e5bdaf09f8ed550432a69e03da1c966d4520b048 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Tue, 20 May 2025 06:57:28 -0700 Subject: [PATCH 01/33] websearch pipeline --- examples/websearch/config.yaml | 8 + .../websearch/enhanced_results_trial.json | 234 ++++++++++++++++++ src/mmore/__main__.py | 66 ++++- src/mmore/run_websearch.py | 8 + src/mmore/websearch/__init__.py | 6 + src/mmore/websearch/config.py | 42 ++++ src/mmore/websearch/llm.py | 57 +++++ src/mmore/websearch/pipeline.py | 234 ++++++++++++++++++ 8 files changed, 652 insertions(+), 3 deletions(-) create mode 100644 examples/websearch/config.yaml create mode 100644 examples/websearch/enhanced_results_trial.json create mode 100644 src/mmore/run_websearch.py create mode 100644 src/mmore/websearch/__init__.py create mode 100644 src/mmore/websearch/config.py create mode 100644 src/mmore/websearch/llm.py create mode 100644 src/mmore/websearch/pipeline.py diff --git a/examples/websearch/config.yaml b/examples/websearch/config.yaml new file mode 100644 index 00000000..76fa4e71 --- /dev/null +++ b/examples/websearch/config.yaml @@ -0,0 +1,8 @@ +input_file: "examples/rag/output.json" +output_file: "examples/websearch/enhanced_results_trial.json" +n_loops: 2 +llm_name: "OpenMeditron/meditron3-8b" +max_searches: 10 +llm_config: + max_new_tokens: 2000 + temperature: 0.2 \ No newline at end of file diff --git a/examples/websearch/enhanced_results_trial.json b/examples/websearch/enhanced_results_trial.json new file mode 100644 index 00000000..63fe5bcf --- /dev/null +++ b/examples/websearch/enhanced_results_trial.json @@ -0,0 +1,234 @@ +[ + { + "query": "When was Barack Obama born?", + "RAG_summary": "1961", + "WEB_RAG_summary": "Barack Obama was born in 1961. He was born in Honolulu, Hawaii, to Ann Dunham and Barack Obama Sr. His father was a Kenyan economist, and his mother was an American anthropologist. Obama's parents divorced when he was two years old, and he was raised by his mother and maternal grandparents in Hawaii. He attended Punahou School, a private college preparatory school in Honolulu, and graduated in 1979. Obama then attended Occidental College in Los Angeles for two years before transferring to Columbia University in New York City, where he earned a Bachelor of Arts degree in political science in 1983. After college, Obama worked as a community organizer in Chicago for three years. He then attended Harvard Law School, where he earned a Juris Doctor (J.D.) degree in 1991. Obama's political career began in 1996, when he was elected to the Illinois State Senate. He served three terms in the state senate before being elected to the U.S. Senate in 2004. Obama was the first African American to be elected to the U.S. Senate from Illinois. In 2008, Obama was elected as the 44th President of the United States, becoming the first African American to hold the office. He was re-elected in 2012. Obama's presidency was marked by several significant accomplishments, including the passage of the Affordable Care Act (ACA), also known as Obamacare, and the Dodd-Frank Wall Street Reform and Consumer Protection Act. He also oversaw the end of the Iraq War and the beginning of the end of the War in Afghanistan. Obama was awarded the Nobel Peace Prize in 2009 for his efforts to strengthen international diplomacy and cooperation. After leaving office in 2017, Obama has remained active in politics, advocating for progressive causes and supporting Democratic candidates in the 2018 midterm elections. He has also written several books, including Dreams from My Father and The Audacity of Hope.", + "WEBSEARCH_details": "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii. His father, Barack Obama Sr., was a Kenyan economist, and his mother, Ann Dunham, was an American anthropologist. Obama's parents divorced when he was two years old, and he was raised by his mother and maternal grandparents in Hawaii. He attended Punahou School, a private college preparatory school in Honolulu, and graduated in 1979. Obama then attended Occidental College in Los Angeles for two years before transferring to Columbia University in New York City, where he earned a Bachelor of Arts degree in political science in 1983. After college, Obama worked as a community organizer in Chicago for three years. He then attended Harvard Law School, where he earned a Juris Doctor (J.D.) degree in 1991. Obama's political career began in 1996, when he was elected to the Illinois State Senate. He served three terms in the state senate before being elected to the U.S. Senate in 2004. Obama was the first African American to be elected to the U.S. Senate from Illinois. In 2008, Obama was elected as the 44th President of the United States, becoming the first African American to hold the office. He was re-elected in 2012. Obama's presidency was marked by several significant accomplishments, including the passage of the Affordable Care Act (ACA), also known as Obamacare, and the Dodd-Frank Wall Street Reform and Consumer Protection Act. He also oversaw the end of the Iraq War and the beginning of the end of the War in Afghanistan. Obama was awarded the Nobel Peace Prize in 2009 for his efforts to strengthen international diplomacy and cooperation. After leaving office in 2017, Obama has remained active in politics, advocating for progressive causes and supporting Democratic candidates in the 2018 midterm elections. He has also written several books, including Dreams from My Father and The Audacity of Hope.", + "sources": [ + { + "title": "Early life and career of Barack Obama - Wikipedia", + "url": "https://en.wikipedia.org/wiki/Early_life_and_career_of_Barack_Obama" + }, + { + "title": "Barack Obama | Biography, Parents, Education, Presidency, Books ...", + "url": "https://www.britannica.com/biography/Barack-Obama" + }, + { + "title": "Kenyan Government Releases Obama's Real Birth Certificate", + "url": "https://www.snopes.com/fact-check/birthing-pains/" + }, + { + "title": "Barack Obama: Biography, 44th U.S. President, Politician", + "url": "https://www.biography.com/political-figures/barack-obama" + }, + { + "title": "Barack Obama - Age, Education & Mother - HISTORY", + "url": "https://www.history.com/articles/barack-obama" + }, + { + "title": "President Barack Obama | Barack Obama Presidential Library", + "url": "https://www.obamalibrary.gov/obamas/president-barack-obama" + }, + { + "title": "Barack Obama | The White House", + "url": "https://bidenwhitehouse.archives.gov/about-the-white-house/presidents/barack-obama/" + }, + { + "title": "Barack Obama - National Geographic Kids", + "url": "https://kids.nationalgeographic.com/history/article/barack-obama" + }, + { + "title": "President Obama | The Obama Foundation", + "url": "https://www.obama.org/democracy-forum-2023/president-obama/" + }, + { + "title": "Barack Obama Timeline - Have Fun With History", + "url": "https://www.havefunwithhistory.com/barack-obama-timeline/" + }, + { + "title": "Barack Obama - Wikipedia", + "url": "https://en.wikipedia.org/wiki/Barack_Obama" + }, + { + "title": "10 Facts About Barack Obama - Have Fun With History", + "url": "https://www.havefunwithhistory.com/facts-about-barack-obama/" + } + ] + }, + { + "query": "Who founded Google?", + "RAG_summary": "Google was founded by Larry Page and Sergey Brin in 1998.", + "WEB_RAG_summary": "Google was founded by Larry Page and Sergey Brin in 1998. The founders were students at Stanford University in California. They created Google as a research project for their Ph.D. Thesis. The name Google was derived from the mathematical term \"googol,\" which means a huge number. Google was initially created to organize the world's information and make it universally accessible and useful. Google's first product was a search engine that allowed users to search the web for information. Google's search engine was unique because it used a new algorithm that ranked web pages based on their relevance to the search query. Google's search engine quickly became popular and the company began to expand its product offerings. Today, Google is a multinational technology company that specializes in Internet-related services and products. Google's main products include the Google search engine, Google Maps, Google Drive, Google Docs, Google Sheets, Google Slides, Google Calendar, Google Hangouts, Google Photos, Google Play, and Google Chrome. Google is headquartered in Mountain View, California, and is led by CEO Sundar Pichai. Google is a subsidiary of Alphabet Inc., a multinational conglomerate created in 2015 to manage Google and its other subsidiaries. Google is one of the most valuable companies in the world, with a market capitalization of over $1 trillion. Google's mission is to organize the world's information and make it universally accessible and useful. Google's vision is to be the most trusted and respected company in the world. Google's values include innovation, simplicity, and user-centricity. Google's products and services are designed to make people's lives easier and more productive. Google's products and services are used by billions of people around the world every day. Google's products and services are available in over 100 languages and are used in over 190 countries. Google's products and services are available on desktop computers, laptops, tablets, smartphones, and smart home devices. Google's products and services are free to use and are supported by advertising. Google's advertising products include Google Ads, Google AdSense, and Google Analytics. Google's advertising products allow businesses to reach their target audience and measure the effectiveness of their advertising campaigns. Google's advertising products are used by businesses of all sizes, from small businesses to multinational corporations. Google's advertising products are available in over 100 countries and are used by businesses in a wide range of industries. Google's advertising products are designed to be easy to use and provide businesses with the tools they need to succeed online. Google's advertising products are supported by a team of experts who provide businesses with the guidance and support they need to succeed. Google's advertising products are constantly evolving to meet the changing needs of businesses. Google's advertising products are designed to be transparent and provide businesses with the information they need to make informed decisions. Google's advertising products are designed to be fair and provide businesses with the opportunity to succeed. Google's advertising products are designed to be accountable and provide businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness", + "WEBSEARCH_details": "", + "sources": [ + { + "title": "History of Google - Wikipedia", + "url": "https://en.wikipedia.org/wiki/History_of_Google" + }, + { + "title": "How we started and where we are today - Google - Google - About Google", + "url": "https://about.google/company-info/our-story/" + }, + { + "title": "Google | History & Facts; Products & Services | Britannica Money", + "url": "https://www.britannica.com/money/Google-Inc" + }, + { + "title": "Larry Page | Biography, Google, & Facts | Britannica Money", + "url": "https://www.britannica.com/money/Larry-Page" + }, + { + "title": "Who Owns Google - Google Founders, Current CEO, & History", + "url": "https://www.feedough.com/who-owns-google-google-founders-current-ceo-history/" + }, + { + "title": "The History of Google and How It Was Invented - ThoughtCo", + "url": "https://www.thoughtco.com/who-invented-google-1991852" + }, + { + "title": "Who Invented Google? The Story Behind The Search Engine", + "url": "https://historycooperative.org/who-invented-google/" + }, + { + "title": "Sergey Brin - Education, Google & Wife - Biography", + "url": "https://www.biography.com/business-leaders/sergey-brin" + }, + { + "title": "Who Actually Invented Google and When? The Full, Fascinating History", + "url": "https://www.historytools.org/companies/who-actually-invented-google-and-when" + }, + { + "title": "Who Founded Google? - WorldAtlas", + "url": "https://www.worldatlas.com/articles/who-founded-google.html" + }, + { + "title": "The Complete History and Evolution of Google", + "url": "https://www.historytools.org/companies/google-guide" + } + ] + }, + { + "query": "Where is the Eiffel Tower located?", + "RAG_summary": "The Eiffel Tower is located in Paris, France.", + "WEB_RAG_summary": "The Eiffel Tower is located in Paris, France. It is one of the most famous landmarks in the world and is known for its unique design and stunning views of the city. The tower was built in 1889 for the World's Fair and was intended to be a temporary structure. However, it quickly became a symbol of Paris and has since been a popular tourist attraction. The tower is 324 meters tall and is made of wrought iron. It is also home to a restaurant and observation deck. 2.", + "WEBSEARCH_details": "The Eiffel Tower is located on the Champ de Mars in Paris, France. It is situated on the Seine River and is easily accessible by public transportation. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for", + "sources": [ + { + "title": "Eiffel Tower - Wikipedia", + "url": "https://en.wikipedia.org/wiki/Eiffel_Tower" + }, + { + "title": "Eiffel Tower | History, Height, & Facts | Britannica", + "url": "https://www.britannica.com/topic/Eiffel-Tower-Paris-France" + }, + { + "title": "Where is the Eiffel Tower? - World In Paris", + "url": "https://worldinparis.com/where-is-the-eiffel-tower" + }, + { + "title": "Where is Eiffel Tower Located? France", + "url": "https://www.whereig.com/wonders-of-the-world/where-is-eiffel-tower.html" + }, + { + "title": "The Eiffel Tower: all there is to know - Official website", + "url": "https://www.toureiffel.paris/en/the-monument" + }, + { + "title": "Where is The Eiffel Tower on the map? Exact location of The Eiffel ...", + "url": "https://www.city2map.com/en/where-is/eiffeltower.html" + }, + { + "title": "Discovering the Heart of Paris: Where is the Eiffel Tower Located?", + "url": "https://francefocusguide.com/blog/eiffel-tower-location-paris/" + }, + { + "title": "Location of the Eiffel tower - Wonders of the world", + "url": "https://www.wonders-of-the-world.net/Eiffel-Tower/Location-of-the-Eiffel-tower.php" + }, + { + "title": "Eiffel Tower Location In Paris: Access, Views ... - TravelPander", + "url": "https://travelpander.com/eiffel-tower-location-in-paris/" + }, + { + "title": "Eiffel Tower Location | Fastest, Cheapest & Scenic Options", + "url": "https://www.paristickets.com/eiffel-tower/location-getting-there-directions/" + }, + { + "title": "Eiffel Tower Location: A Parisian Icon in the Heart of France", + "url": "https://geographypin.com/eiffel-tower-location/" + }, + { + "title": "The Eiffel Tower - History and Facts | History Hit", + "url": "https://www.historyhit.com/locations/the-eiffel-tower/" + }, + { + "title": "Eiffel Tower: An Architecture Landmark to Visit in Paris", + "url": "https://www.architecturelab.net/architecture/landmark/eiffel-tower/" + }, + { + "title": "Information and facts about the Eiffel Tower", + "url": "https://www.eiffeltower-paris.info/information-and-facts/" + } + ] + }, + { + "query": "When will the artificial intelligence conference be held?", + "RAG_summary": "2024", + "WEB_RAG_summary": "The artificial intelligence conference will be held in 2025. The conference will be held in Chicago, IL, at Northwestern University. The conference is organized by the Association for the Advancement of Artificial Intelligence (AAAI). The conference will be held in 2025, and the exact date is not yet known. The conference will cover various topics related to artificial intelligence, including machine learning, natural language processing, computer vision, and robotics. The conference will also feature keynote speakers, workshops, and tutorials. The conference is expected to attract researchers, practitioners, and students from around the world.", + "WEBSEARCH_details": "The 39th Annual AAAI Conference on Artificial Intelligence will be held in 2025. The conference will be held in Chicago, IL, at Northwestern University. The conference is organized by the Association for the Advancement of Artificial Intelligence (AAAI). The conference will cover various topics related to artificial intelligence, including machine learning, natural language processing, computer vision, and robotics. The conference will also feature keynote speakers, workshops, and tutorials. The conference is expected to attract researchers, practitioners, and students from around the world.\nThe Top AI Conferences To Attend In 2025 is a list of the top AI conferences to attend in 2025. The list includes the International Conference on Artificial Intelligence (ICAI), the International Joint Conference on Artificial Intelligence (IJCAI), the Conference on Neural Information Processing Systems (NeurIPS), and the Conference on Computer Vision and Pattern Recognition (CVPR). The list also includes the International Conference on Machine Learning (ICML), the Conference on Human Factors in Computing Systems (CHI), and the Conference on Artificial Intelligence (AAAI).\nThe Best Artificial Intelligence Conferences & Events of 2025 is a list of the best AI conferences and events to attend in 2025. The list includes the International Conference on Artificial Intelligence (ICAI), the International Joint Conference on Artificial Intelligence (IJCAI), the Conference on Neural Information Processing Systems (NeurIPS), and the Conference on Computer Vision and Pattern Recognition (CVPR). The list also includes the International Conference on Machine Learning (ICML), the Conference on Human Factors in Computing Systems (CHI), and the Conference on Artificial Intelligence (AAAI).\nThe Future of AI: Top Global Conferences to Watch from 2025 to 2026 is a list of the top global AI conferences to watch from 2025 to 2026. The list includes the International Conference on Artificial Intelligence (ICAI), the International Joint Conference on Artificial Intelligence (IJCAI), the Conference on Neural Information Processing Systems (NeurIPS), and the Conference on Computer Vision and Pattern Recognition (CVPR). The list also includes the International Conference on Machine Learning (ICML), the Conference on Human Factors in Computing Systems (CHI), and the Conference on Artificial Intelligence (AAAI).\nIAIC2024 is an international conference on artificial intelligence that will be held in 2024. The conference will cover various topics related to artificial intelligence, including machine learning, natural language processing, computer vision, and robotics. The conference will also feature keynote speakers, workshops, and tutorials. The conference is expected to attract researchers, practitioners, and students from around the world.\nThe 2025 MIT AI Conference is an annual conference on artificial intelligence that will be held in 2025. The conference will cover various topics related to artificial intelligence, including machine learning, natural language processing, computer vision, and robotics. The conference will also feature keynote speakers, workshops, and tutorials. The conference is expected to attract researchers, practitioners, and students from around the world.", + "sources": [ + { + "title": "ICAIL 2025 | Chicago, IL - Northwestern University", + "url": "https://sites.northwestern.edu/icail2025/" + }, + { + "title": "The 39th Annual AAAI Conference on Artificial Intelligence", + "url": "https://aaai.org/conference/aaai/aaai-25/" + }, + { + "title": "The Top AI Conferences To Attend In 2025 - oxfordabstracts.com", + "url": "https://oxfordabstracts.com/blog/top-ai-conferences-to-attend/" + }, + { + "title": "The Best Artificial Intelligence Conferences & Events of 2025", + "url": "https://www.splunk.com/en_us/blog/learn/ai-artificial-intelligence-conferences-events.html" + }, + { + "title": "The Future of AI: Top Global Conferences to Watch from 2025 to 2026", + "url": "https://dscnextcon.com/the-future-of-ai-top-global-conferences-to-watch-from-2025-to-2026/" + }, + { + "title": "2025 MIT AI Conference | ILP", + "url": "https://ilp.mit.edu/AI25" + }, + { + "title": "IAIC2024", + "url": "http://2024.iaicconf.com/" + }, + { + "title": "Association for the Advancement of Artificial Intelligence AAAI Annual ...", + "url": "https://www.paconvention.com/events/detail/association-for-the-advancement-of-artificial-intelligence-aaai-annual-conference-25016" + }, + { + "title": "Artificial Intelligence Conference | Machine Learning Meetings ...", + "url": "https://cognitionconferences.com/artificialintelligence/" + }, + { + "title": "Major AI Events 2025 - Airmeet", + "url": "https://www.airmeet.com/hub/blog/major-ai-events/" + }, + { + "title": "The Biggest AI Conferences & Events Happening in 2025", + "url": "https://engine.com/business-travel-guide/biggest-ai-expos-events-conferences-2025" + }, + { + "title": "AI Conferences 2025 : Agenda + List of Events and Exhibitions", + "url": "https://www.aixploria.com/en/worldwide-top-ai-conferences-agenda/" + }, + { + "title": "Top Artificial Intelligence Conferences 2025", + "url": "https://academicworldresearch.org/blog/top-artificial-intelligence-conferences/" + } + ] + } +] \ No newline at end of file diff --git a/src/mmore/__main__.py b/src/mmore/__main__.py index 5469b6e0..1f19fa6d 100644 --- a/src/mmore/__main__.py +++ b/src/mmore/__main__.py @@ -1,10 +1,60 @@ import click - +import argparse +import yaml + +from contextlib import contextmanager +import warnings +import sys +import os + +from mmore.websearch.pipeline import WebsearchPipeline +from mmore.websearch.config import WebsearchConfig + +# @contextmanager +# def suppress_warnings_and_stdout(): +# # Suppress specific warnings +# warnings.filterwarnings('ignore', category=FutureWarning, message="The input name `inputs` is deprecated*") +# warnings.filterwarnings('ignore', category=FutureWarning, message="*UserWarning:*") + +# pypdfium_message = "-> Cannot close object, library is destroyed. This may cause a memory leak!*" +# # Redirect stdout to devnull to catch pypdfium messages +# old_stdout = sys.stdout +# devnull = open(os.devnull, 'w') +# # Suppress pypdfium warnings +# sys.stdout = devnull + +# try: +# sys.stdout = devnull +# yield + +# finally: +# sys.stdout = old_stdout +# devnull.close() + +# import logging +# logger = logging.getLogger(__name__) +# MMORE_EMOJI = "🐮" +# logging.basicConfig(format=f'[MMORE {MMORE_EMOJI} -- %(asctime)s] %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') @click.group() def main(): - """CLI for mmore commands.""" - pass + """Main entry point.""" + parser = argparse.ArgumentParser(description="MMORE command line interface") + subparsers = parser.add_subparsers(dest="command", help="Command to run") + + # Websearch command + websearch_parser = subparsers.add_parser("websearch", help="Run websearch pipeline") + websearch_parser.add_argument("--config-file", required=True, help="Path to config file") + + args = parser.parse_args() + + if args.command == "websearch": + with open(args.config_file, "r") as f: + config = yaml.safe_load(f) + pipeline = WebsearchPipeline(WebsearchConfig.from_dict(config)) + pipeline.run() + else: + parser.print_help() @main.command() @@ -92,5 +142,15 @@ def rag(config_file): run_rag(config_file) +@main.command() +@click.option('--config-file', type=str, required=True, help='Configuration file path.') +def websearch(config_file): + """Run the websearch pipeline.""" + from .run_websearch import run_websearch + with open(config_file, 'r') as f: + config = yaml.safe_load(f) + run_websearch(config) + + if __name__ == "__main__": main() diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py new file mode 100644 index 00000000..215a54fe --- /dev/null +++ b/src/mmore/run_websearch.py @@ -0,0 +1,8 @@ +"""Run the websearch pipeline.""" +from mmore.websearch.pipeline import WebsearchPipeline +from mmore.websearch.config import WebsearchConfig + +def run_websearch(config): + """Run the websearch pipeline.""" + pipeline = WebsearchPipeline(WebsearchConfig.from_dict(config)) + pipeline.run() \ No newline at end of file diff --git a/src/mmore/websearch/__init__.py b/src/mmore/websearch/__init__.py new file mode 100644 index 00000000..79863ad7 --- /dev/null +++ b/src/mmore/websearch/__init__.py @@ -0,0 +1,6 @@ +"""Websearch module for enhancing RAG outputs with web search.""" + +from .pipeline import WebsearchPipeline +from .config import WebsearchConfig + +__all__ = ["WebsearchPipeline", "WebsearchConfig"] \ No newline at end of file diff --git a/src/mmore/websearch/config.py b/src/mmore/websearch/config.py new file mode 100644 index 00000000..d56d22bd --- /dev/null +++ b/src/mmore/websearch/config.py @@ -0,0 +1,42 @@ +"""Configuration for websearch pipeline.""" +from dataclasses import dataclass +from typing import Dict, Any + +from .llm import LLMConfig + +@dataclass +class WebsearchConfig: + """Configuration for websearch pipeline.""" + input_file: str + output_file: str + n_loops: int = 2 + llm_name: str = "OpenMeditron/meditron3-8b" + max_searches: int = 10 + llm_config: Dict[str, Any] = None + + def __post_init__(self): + if self.llm_config is None: + self.llm_config = { + "max_new_tokens": 1000, + "temperature": 0.7 + } + + @classmethod + def from_dict(cls, config: Dict[str, Any]) -> "WebsearchConfig": + """Create config from dictionary.""" + return cls( + input_file=config["input_file"], + output_file=config["output_file"], + n_loops=config.get("n_loops", 2), + llm_name=config.get("llm_name", "OpenMeditron/meditron3-8b"), + max_searches=config.get("max_searches", 10), + llm_config=config.get("llm_config", None) + ) + + def get_llm_config(self) -> LLMConfig: + """Get LLM configuration.""" + return LLMConfig( + llm_name=self.llm_name, + max_new_tokens=self.llm_config["max_new_tokens"], + temperature=self.llm_config["temperature"] + ) \ No newline at end of file diff --git a/src/mmore/websearch/llm.py b/src/mmore/websearch/llm.py new file mode 100644 index 00000000..de8acfe1 --- /dev/null +++ b/src/mmore/websearch/llm.py @@ -0,0 +1,57 @@ +"""Simple LLM implementation for websearch.""" +from dataclasses import dataclass + +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch +from langchain_core.messages import BaseMessage + +@dataclass +class LLMConfig: + """Simple LLM configuration.""" + llm_name: str + max_new_tokens: int = 2000 + temperature: float = 0.2 + +class LLM: + """Simple LLM wrapper for websearch.""" + def __init__(self, model, tokenizer, config: LLMConfig): + self.model = model + self.tokenizer = tokenizer + self.config = config + + @classmethod + def from_config(cls, config: LLMConfig): + """Create LLM from configuration.""" + tokenizer = AutoTokenizer.from_pretrained(config.llm_name) + model = AutoModelForCausalLM.from_pretrained( + config.llm_name, + torch_dtype=torch.float16, + device_map="auto" + ) + return cls(model, tokenizer, config) + + def invoke(self, messages: list[BaseMessage]) -> BaseMessage: + """Generate response for the given messages.""" + # Convert messages to prompt + prompt = "" + for msg in messages: + if msg.type == "system": + prompt += f"System: {msg.content}\n" + elif msg.type == "human": + prompt += f"Human: {msg.content}\n" + elif msg.type == "assistant": + prompt += f"Assistant: {msg.content}\n" + prompt += "Assistant: " + + # Generate + inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device) + outputs = self.model.generate( + **inputs, + max_new_tokens=self.config.max_new_tokens, + temperature=self.config.temperature, + do_sample=True, + pad_token_id=self.tokenizer.eos_token_id + ) + response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + + return BaseMessage(content=response, type="assistant") \ No newline at end of file diff --git a/src/mmore/websearch/pipeline.py b/src/mmore/websearch/pipeline.py new file mode 100644 index 00000000..8b073e1c --- /dev/null +++ b/src/mmore/websearch/pipeline.py @@ -0,0 +1,234 @@ +import json +import re +from pathlib import Path +from typing import Dict, List, Any, Optional, Set + +from duckduckgo_search import DDGS +from langchain_core.messages import HumanMessage, SystemMessage + +from .llm import LLM +from .config import WebsearchConfig + + +class WebsearchPipeline: + """Pipeline for enhancing RAG outputs with web search.""" + + def __init__(self, config: WebsearchConfig): + """Initialize pipeline.""" + self.config = config + self.llm = LLM.from_config(config.get_llm_config()) + + @staticmethod + def clean_llm_output(text: str) -> str: + """Remove internal model tokens.""" + return re.sub(r'<\|.*?\|>', '', text).strip() + + @staticmethod + def extract_llm_answer(raw_response: str) -> str: + """Extract the answer following the 'Answer:' prefix from the LLM response.""" + raw_response = WebsearchPipeline.clean_llm_output(raw_response) + match = re.search(r'Answer:(.*)', raw_response, re.DOTALL | re.IGNORECASE) + if match: + return match.group(1).strip() + return raw_response + + def generate_summary(self, rag_answer: str, query: str) -> str: + """Build the prompt, invoke the LLM, and extract the answer.""" + prompt = ( + "You have only the following context to answer the question—do not use any external knowledge.\n\n" + f"Question: {query}\n\n" + "Context:\n" + f"{rag_answer}\n\n" + "If the context contains the answer or useful information, respond with that information. " + "Answer:" + ) + + messages = [ + SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), + HumanMessage(content=prompt), + ] + response = self.llm.invoke(messages) + return self.extract_llm_answer(response.content) + + @staticmethod + def extract_query_from_llm_output(text: str) -> str: + """Extract everything between tags, picking longest match.""" + matches = re.findall(r'<\s*question\s*>(.*?)<\s*/\s*question\s*>', text, re.DOTALL | re.IGNORECASE) + return max(matches, key=len).strip() if matches else "" + + @staticmethod + def validate_search_query(query: str, original_query: str) -> str: + """Fallback to default if empty or >30 words.""" + return query if query and len(query.split()) <= 30 else original_query + + @staticmethod + def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: + """Perform DuckDuckGo search.""" + try: + with DDGS() as ddgs: + return [ + {'title': r.get('title'), 'url': r.get('href')} + for r in ddgs.text(query, max_results=max_results) + ] + except Exception as e: + print(f"Search error: {e}") + return [] + + def generate_search_query(self, original_query: str, rag_answer: str, previous_analysis: Optional[str] = None) -> str: + """Generate and extract query enclosed in tags.""" + base = f"Based on:\n- Original Query: '{original_query}'\n- RAG Answer: '{rag_answer}'" + if previous_analysis: + base += f"\n- Previous Findings: {previous_analysis}" + prompt = ( + f"{base}\n" + "Generate a concise search query (up to 30 words) that either directly answers the original question " + "or complements previous findings by seeking missing or updated information. " + "Enclose your answer within tags." + ) + messages = [SystemMessage(content="You are a search query generator."), HumanMessage(content=prompt)] + response = self.llm.invoke(messages) + + raw = self.clean_llm_output(response.content) + extracted = self.extract_query_from_llm_output(raw) + return self.validate_search_query(' '.join(extracted.split()), original_query) + + def analyze_search_results(self, original_query: str, rag_answer: str, results: List[Dict[str, str]]) -> str: + """Analyze search results and create a combined summary with RAG and web information.""" + collated = '\n\n'.join([f"Source: {r['title']}\n{r.get('body', '')}" for r in results]) + prompt = ( + f"Original Query: {original_query}\n" + f"Current Knowledge (RAG): {rag_answer}\n" + "New Information from Web:\n" f"{collated}\n\n" + "Provide a detailed analysis that combines the RAG knowledge with the new web information. " + "Your response should:\n" + "1. Integrate both RAG and web information comprehensively\n" + "2. Include specific details, facts, and findings\n" + "3. Highlight important updates or corrections from the web sources\n" + "Structure your response as follows in the tags :\n" + "- Summary: A summary of all key points from the RAG and the web sources responding directly to the query\n" + "- More detailed informations: Any additional useful information more detailed\n" + "and outside of the tags:\n" + "ADDITIONAL GAPS:\n" + "- [List any remaining questions or areas needing more research in order to improve the answer of the original query]" + ) + messages = [ + SystemMessage(content="You are a research analyst focused on providing detailed, comprehensive analysis."), + HumanMessage(content=prompt), + ] + response = self.llm.invoke(messages) + return self.clean_llm_output(response.content) + + @staticmethod + def extract_enhanced_answer(text: str) -> Optional[Dict[str, str]]: + """Extract the content between enhanced_answer tags, focusing on the last occurrence.""" + try: + pattern = r'\s*(.*?)\s*' + matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE) + + if not matches: + # fallback extraction if tags missing + pattern_sum = r'Summary:(.*?)(?:More detailed informations:|ADDITIONAL GAPS:|$)' + summary_matches = re.findall(pattern_sum, text, re.DOTALL | re.IGNORECASE) + + pattern_det = r'More detailed informations:(.*?)(?:ADDITIONAL GAPS:|$)' + details_matches = re.findall(pattern_det, text, re.DOTALL | re.IGNORECASE) + + if summary_matches or details_matches: + return { + 'summary': summary_matches[-1].strip() if summary_matches else "", + 'details': details_matches[-1].strip() if details_matches else "" + } + return None + + extracted_content = matches[-1].strip() + + summary_lines = [] + details_lines = [] + current_section = None + + for line in extracted_content.splitlines(): + line = line.strip() + if not line or line.startswith('Source:') or line.startswith('New Information'): + continue + + if line.lower().startswith('summary:'): + current_section = 'summary' + continue + elif line.lower().startswith('more detailed informations'): + current_section = 'details' + continue + + if current_section == 'summary': + summary_lines.append(line) + elif current_section == 'details': + details_lines.append(line) + + return { + 'summary': '\n'.join(summary_lines).strip(), + 'details': '\n'.join(details_lines).strip() + } + except Exception as e: + print(f"Warning: Error during extraction: {e}") + return None + + def process_record(self, record: Dict[str, Any]) -> Dict[str, Any]: + """Process a single JSON record with web search enhancement.""" + user_query = record.get('input', '') + initial_rag_answer = record.get('answer', '') + + # Generate initial summary + initial_summary = self.generate_summary(initial_rag_answer, user_query) + + previous_analysis = None + all_sources: List[Dict[str, str]] = [] + seen_urls: Set[str] = set() + current_knowledge = initial_summary + final_answer = None + + for i in range(1, self.config.n_loops + 1): + print(f"Loop {i} of {self.config.n_loops}") + query = self.generate_search_query(user_query, current_knowledge, previous_analysis) + print(f"Generated query: {query}, based on the original query: {user_query}") + + results = self.duckduckgo_search(query, self.config.max_searches) + + # Add only new sources (de-dup by url) + for r in results: + if r['url'] not in seen_urls: + all_sources.append({'title': r['title'], 'url': r['url']}) + seen_urls.add(r['url']) + + analysis = self.analyze_search_results(user_query, current_knowledge, results) + + extracted_answer = self.extract_enhanced_answer(analysis) + if extracted_answer: + current_knowledge = extracted_answer.get('summary', '') + '\n' + extracted_answer.get('details', '') + final_answer = extracted_answer + else: + # fallback to full analysis if no extraction + current_knowledge = analysis + + previous_analysis = analysis + + return { + 'query': user_query, + 'RAG_summary': initial_summary, + 'WEB_RAG_summary': final_answer.get('summary', '') if final_answer else "", + 'WEBSEARCH_details': final_answer.get('details', '') if final_answer else "", + 'sources': all_sources, + } + + def run(self): + """Run the websearch pipeline.""" + with open(self.config.input_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + results = [self.process_record(record) for record in data] + + output_path = Path(self.config.output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + + print(f"\nResults saved to {output_path}") From 75a8b3ae3551ba9ffa9b20e43d3fbe8b97dd0be7 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Thu, 5 Jun 2025 13:07:39 -0700 Subject: [PATCH 02/33] websearch - half pipeline - not tested --- src/mmore/__main__.py | 74 ++----- src/mmore/run_websearch.py | 110 +++++++++- src/mmore/websearchRAG/config.py | 57 ++++++ src/mmore/websearchRAG/llm.py | 41 ++++ src/mmore/websearchRAG/pipeline.py | 312 +++++++++++++++++++++++++++++ 5 files changed, 529 insertions(+), 65 deletions(-) create mode 100644 src/mmore/websearchRAG/config.py create mode 100644 src/mmore/websearchRAG/llm.py create mode 100644 src/mmore/websearchRAG/pipeline.py diff --git a/src/mmore/__main__.py b/src/mmore/__main__.py index 1f19fa6d..182f8bc2 100644 --- a/src/mmore/__main__.py +++ b/src/mmore/__main__.py @@ -1,60 +1,12 @@ +# __main__.py + import click -import argparse import yaml -from contextlib import contextmanager -import warnings -import sys -import os - -from mmore.websearch.pipeline import WebsearchPipeline -from mmore.websearch.config import WebsearchConfig - -# @contextmanager -# def suppress_warnings_and_stdout(): -# # Suppress specific warnings -# warnings.filterwarnings('ignore', category=FutureWarning, message="The input name `inputs` is deprecated*") -# warnings.filterwarnings('ignore', category=FutureWarning, message="*UserWarning:*") - -# pypdfium_message = "-> Cannot close object, library is destroyed. This may cause a memory leak!*" -# # Redirect stdout to devnull to catch pypdfium messages -# old_stdout = sys.stdout -# devnull = open(os.devnull, 'w') -# # Suppress pypdfium warnings -# sys.stdout = devnull - -# try: -# sys.stdout = devnull -# yield - -# finally: -# sys.stdout = old_stdout -# devnull.close() - -# import logging -# logger = logging.getLogger(__name__) -# MMORE_EMOJI = "🐮" -# logging.basicConfig(format=f'[MMORE {MMORE_EMOJI} -- %(asctime)s] %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') - @click.group() def main(): - """Main entry point.""" - parser = argparse.ArgumentParser(description="MMORE command line interface") - subparsers = parser.add_subparsers(dest="command", help="Command to run") - - # Websearch command - websearch_parser = subparsers.add_parser("websearch", help="Run websearch pipeline") - websearch_parser.add_argument("--config-file", required=True, help="Path to config file") - - args = parser.parse_args() - - if args.command == "websearch": - with open(args.config_file, "r") as f: - config = yaml.safe_load(f) - pipeline = WebsearchPipeline(WebsearchConfig.from_dict(config)) - pipeline.run() - else: - parser.print_help() + """CLI for mmore commands.""" + pass @main.command() @@ -143,13 +95,21 @@ def rag(config_file): @main.command() -@click.option('--config-file', type=str, required=True, help='Configuration file path.') +@click.option( + "--config-file", + type=str, + required=True, + help="Path to the Websearch configuration file (YAML).", +) def websearch(config_file): - """Run the websearch pipeline.""" + """Run the Websearch (+ optional RAG) pipeline.""" from .run_websearch import run_websearch - with open(config_file, 'r') as f: - config = yaml.safe_load(f) - run_websearch(config) + + # Load your YAML and pass it into the runner + with open(config_file, "r") as f: + config_dict = yaml.safe_load(f) + + run_websearch(config_dict) if __name__ == "__main__": diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index 215a54fe..3f28d085 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -1,8 +1,102 @@ -"""Run the websearch pipeline.""" -from mmore.websearch.pipeline import WebsearchPipeline -from mmore.websearch.config import WebsearchConfig - -def run_websearch(config): - """Run the websearch pipeline.""" - pipeline = WebsearchPipeline(WebsearchConfig.from_dict(config)) - pipeline.run() \ No newline at end of file +# mmore/run_websearch.py + +import argparse +import logging +import os +import time +from dataclasses import dataclass +from typing import Any, Dict + +import torch + +from .websearch.config import WebsearchConfig +from .websearch.pipeline import WebsearchPipeline +from .utils import load_config + +WEBSRCH_EMOJI = "🌐" +logger = logging.getLogger(__name__) +logging.basicConfig( + format=f"[WebSearch {WEBSRCH_EMOJI} -- %(asctime)s] %(message)s", + level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S", +) + +# Disable certain CUDA optimizations for stability (same as in run_process.py) +torch.backends.cuda.enable_mem_efficient_sdp(False) +torch.backends.cuda.enable_flash_sdp(False) +torch.backends.cuda.enable_math_sdp(True) + + +@dataclass +class WebsearchInference: + """ + Inference configuration for Websearch (+ optional RAG) pipeline. + This should mirror the keys under your top-level 'websearch:' in the YAML. + """ + rag_config_path: str # path to RAG config (e.g. "../rag/config.yaml") + use_rag: bool # whether to run RAG first + rag_summary: bool # whether to summarize the RAG answer + input_file: str # path to JSON input (either queries or RAG output) + output_file: str # path to write final JSON results + n_subqueries: int # number of sub-queries to generate + max_searches: int # max DuckDuckGo hits per sub-query + llm_config: Dict[str, Any] # passed into LLMConfig (llm_name, max_new_tokens, temperature, etc.) + + # If you add more fields under `websearch:` in your YAML, add them here. + + +def run_websearch(config_file: str): + """ + Run the Websearch (+ optional RAG) pipeline according to the YAML at config_file. + + 1) Load WebsearchInference via load_config. + 2) Convert that to a WebsearchConfig. + 3) Instantiate and run WebsearchPipeline. + """ + click_msg = f"Websearch configuration file: {config_file}" + logger.info(click_msg) + + start_time = time.time() + + # 1) Load the YAML into our dataclass + cfg: WebsearchInference = load_config(config_file, WebsearchInference) + + # 2) Build WebsearchConfig from our dataclass + # WebsearchConfig.from_dict expects a dict with exactly the same keys, so: + web_cfg = WebsearchConfig.from_dict({ + "rag_config_path": cfg.rag_config_path, + "use_rag": cfg.use_rag, + "rag_summary": cfg.rag_summary, + "input_file": cfg.input_file, + "output_file": cfg.output_file, + "n_subqueries": cfg.n_subqueries, + "max_searches": cfg.max_searches, + "llm_config": cfg.llm_config, + }) + + logger.info(f"Using WebsearchConfig: {web_cfg}") + + # 3) Instantiate and run + pipeline = WebsearchPipeline(config=web_cfg) + + pipeline_start = time.time() + pipeline.run() + pipeline_end = time.time() + + elapsed = pipeline_end - pipeline_start + logger.info(f"Websearch pipeline completed in {elapsed:.2f} seconds.") + + total_elapsed = time.time() - start_time + logger.info(f"Total execution time: {total_elapsed:.2f} seconds.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the Websearch (+ optional RAG) pipeline.") + parser.add_argument( + "--config-file", + type=str, + required=True, + help="Path to the Websearch configuration file (YAML)." + ) + args = parser.parse_args() + run_websearch(args.config_file) diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py new file mode 100644 index 00000000..3ef374c7 --- /dev/null +++ b/src/mmore/websearchRAG/config.py @@ -0,0 +1,57 @@ +# mmore/websearch/config.py + +from dataclasses import dataclass +from typing import Any, Dict + +from rag.llm import LLMConfig # Reuse the same LLMConfig as RAG + + +@dataclass +class WebsearchConfig: + """ + Configuration for WebsearchPipeline. + + Fields: + rag_config_path: (str or None) Path to the RAG config YAML. Required if use_rag=True. + use_rag: (bool) If True, run RAG first; otherwise skip directly to sub-query generation. + rag_summary: (bool) If True, run an initial LLM-based summary of the RAG answer. + input_file: (str) Path to the JSON file used as “queries” (or RAG output). + output_file: (str) Path where the enhanced JSON results will be written. + n_subqueries: (int) Number of sub-queries to generate via LLM. + max_searches: (int) Max results to fetch from DuckDuckGo per sub-query. + llm_config: (dict) Passed to rag.llm.LLMConfig (keys: llm_name, max_new_tokens, temperature, etc.) + """ + + rag_config_path: str # e.g. "../rag/config.yaml" + use_rag: bool + rag_summary: bool + input_file: str + output_file: str + n_subqueries: int + max_searches: int + llm_config: Dict[str, Any] + + @staticmethod + def from_dict(d: Dict[str, Any]) -> "WebsearchConfig": + # Validate required keys + required = ["use_rag", "rag_summary", "input_file", "output_file", "n_subqueries", "max_searches", "llm_config"] + for key in required: + if key not in d: + raise ValueError(f"Missing '{key}' in WebsearchConfig.") + rag_config_path = d.get("rag_config_path", "") + return WebsearchConfig( + rag_config_path=rag_config_path, + use_rag=d["use_rag"], + rag_summary=d["rag_summary"], + input_file=d["input_file"], + output_file=d["output_file"], + n_subqueries=int(d["n_subqueries"]), + max_searches=int(d["max_searches"]), + llm_config=d["llm_config"], + ) + + def get_llm_config(self) -> LLMConfig: + """ + Convert the nested llm_config dict into an instance of rag.llm.LLMConfig. + """ + return LLMConfig(**self.llm_config) diff --git a/src/mmore/websearchRAG/llm.py b/src/mmore/websearchRAG/llm.py new file mode 100644 index 00000000..7e5344db --- /dev/null +++ b/src/mmore/websearchRAG/llm.py @@ -0,0 +1,41 @@ + + + + + +def generate_query_scratch(query): + return "query" + + +def resume_from_RAG(query, RAG_output): + return "query" + + +def define_n_queries_based_on_RAG(query, RAG_output): + return "query" + + +def resume_web_search(query, web_search_output): + return "query" + + +def resume_all_web_search(query, web_search_output): + return "query" + + +def resume_all_web_search_and_RAG_summary(query, RAG_output_summary, web_search_output): + return "query" + +def resume_all_web_search_and_RAG_full(query, RAG_output_summary, web_search_output): + return "query" + + +def keep_only_the_most_relevant_web_search(query, web_search_output): + return "query" + + + +def provide_final_answer(): + return answer_short, answer_long + + diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py new file mode 100644 index 00000000..27819a0f --- /dev/null +++ b/src/mmore/websearchRAG/pipeline.py @@ -0,0 +1,312 @@ +# mmore/websearch/pipeline.py + +import json +import re +from pathlib import Path +from typing import Dict, List, Any, Optional, Set + +from duckduckgo_search import DDGS +from langchain_core.messages import HumanMessage, SystemMessage + +from rag.run_rag import run_rag +from rag.utils import read_queries +from rag.llm import LLM +from .config import WebsearchConfig + + +class WebsearchPipeline: + """ + Pipeline for optionally running RAG first, then generating sub-queries, + performing web searches, and finally integrating everything via the LLM. + """ + + def __init__(self, config: WebsearchConfig): + self.config = config + # Initialize the LLM using the same LLMConfig as RAG + self.llm = LLM.from_config(config.get_llm_config()) + + # Will store RAG results if we run RAG + self.rag_results: Optional[List[Dict[str, Any]]] = None + + @staticmethod + def clean_llm_output(text: str) -> str: + """Remove internal tokens or delimiters from the LLM output.""" + return re.sub(r'<\|.*?\|>', '', text).strip() + + @staticmethod + def extract_answer_after_prefix(raw: str, prefix: str) -> str: + """ + Extract everything after a given prefix (case-insensitive). + If prefix not found, return raw. + """ + raw = WebsearchPipeline.clean_llm_output(raw) + pattern = re.compile(re.escape(prefix) + r"(.*)", re.IGNORECASE | re.DOTALL) + match = pattern.search(raw) + return match.group(1).strip() if match else raw + + @staticmethod + def is_useful_rag_answer(rag_answer: str) -> bool: + """ + Decide whether the RAG answer is “useful.” We treat nonempty answers + that do not just say “I don’t know” as useful. + """ + ans = rag_answer.strip().lower() + if not ans: + return False + # If it begins with a phrase like “i don’t know” or “no answer” etc. + if ans.startswith("i don’t know") or ans.startswith("i dont know") or ans.startswith("no"): + return False + return True + + def generate_subqueries( + self, + original_query: str, + rag_answer: Optional[str] = None + ) -> List[str]: + """ + Ask the LLM to produce exactly self.config.n_subqueries follow‐up search queries. + - If rag_answer is provided and “useful,” we prompt for subqueries that complement it. + - If rag_answer is None or not useful, we prompt for subqueries that answer the original query fully. + We expect the LLM to output one sub-query per line. + """ + n = self.config.n_subqueries + + if rag_answer and self.is_useful_rag_answer(rag_answer): + # Build a prompt that asks for sub-queries to complement the existing RAG answer + prompt = ( + f"Original Query: \"{original_query}\"\n" + f"Current RAG Answer: \"{rag_answer}\"\n\n" + f"Generate {n} concise search queries (each ≤30 words) that would help update or fill gaps " + "in the RAG answer. Provide exactly one query per line. " + "Do not provide any additional commentary." + ) + else: + # No useful RAG answer: ask for sub-queries that collectively answer the original question + prompt = ( + f"Original Query: \"{original_query}\"\n\n" + f"No RAG knowledge is available. Generate {n} concise search queries " + "(each ≤30 words) that, when searched, would collectively answer the original query. " + "Provide exactly one query per line. Do not provide any additional commentary." + ) + + messages = [ + SystemMessage(content="You are a helpful assistant that generates search queries."), + HumanMessage(content=prompt), + ] + response = self.llm.invoke(messages) + raw = self.clean_llm_output(response.content) + + # Split lines, discard empty lines, strip numbering if present + lines = [line.strip() for line in raw.splitlines() if line.strip()] + subqueries: List[str] = [] + for line in lines: + # Remove leading numbering like “1. …” or “(1) …” + cleaned = re.sub(r'^\s*\d+[\).\s]+\s*', '', line) + subqueries.append(cleaned) + if len(subqueries) >= n: + break + + # If LLM gave fewer lines, we can pad with the original query + while len(subqueries) < n: + subqueries.append(original_query) + + return subqueries + + @staticmethod + def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: + """ + Perform a DuckDuckGo search and return a list of {'title', 'url'} dicts. + """ + try: + with DDGS() as ddgs: + return [ + {"title": r.get("title", ""), "url": r.get("href", "")} + for r in ddgs.text(query, max_results=max_results) + ] + except Exception as e: + print(f"Search error for \"{query}\": {e}") + return [] + + def integrate_with_web( + self, + original_query: str, + base_knowledge: str, + all_results: List[Dict[str, str]] + ) -> str: + """ + Given: + - original_query (str), + - base_knowledge (either a non‐empty RAG answer or just an empty string), + - all_results: list of {title, url} from all sub‐queries, + produce a final “enhanced answer” via the LLM. + We’ll ask the LLM to weave base_knowledge + new web info into a single answer. + We’ll ask it to output the answer enclosed in tags. + """ + # Collate sources into a single text blob. (We omit “body” because DuckDuckGo API + # only supplies title & URL. You could extend to fetch page snippets if you like.) + collated = "\n".join([f"Source: {r['title']}, URL: {r['url']}" for r in all_results]) + + if base_knowledge: + prompt = ( + f"Original Query: \"{original_query}\"\n\n" + f"Current Knowledge (RAG): \"{base_knowledge}\"\n\n" + f"New Information from Web:\n{collated}\n\n" + "Produce a comprehensive answer that integrates the RAG knowledge with the new web information. " + "Your final answer must be enclosed in tags. " + "If there are any remaining gaps, list them under “ADDITIONAL GAPS” after the tags." + ) + else: + prompt = ( + f"Original Query: \"{original_query}\"\n\n" + f"No prior RAG knowledge. New Information from Web:\n{collated}\n\n" + "Provide a comprehensive answer that directly addresses the original query, " + "based solely on these web results. Your final answer must be enclosed in tags. " + "If there are any remaining gaps, list them under “ADDITIONAL GAPS” after the tags." + ) + + messages = [ + SystemMessage(content="You are a research analyst that writes detailed answers."), + HumanMessage(content=prompt), + ] + response = self.llm.invoke(messages) + return self.clean_llm_output(response.content) + + @staticmethod + def extract_enhanced_answer(text: str) -> Dict[str, Any]: + """ + Extract the content between , and any “ADDITIONAL GAPS” afterwards. + Returns a dict: + { + "answer": , + "additional_gaps": + } + """ + # 1) Extract between tags + match = re.search(r"\s*(.*?)\s*", text, re.DOTALL | re.IGNORECASE) + answer = match.group(1).strip() if match else text.strip() + + # 2) Extract “ADDITIONAL GAPS:” block if present + gaps = "" + gap_match = re.search(r"ADDITIONAL GAPS:\s*(.*)$", text, re.DOTALL | re.IGNORECASE) + if gap_match: + gaps = gap_match.group(1).strip() + + return {"answer": answer, "additional_gaps": gaps} + + def process_record(self, record: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a single record (dict). The record must contain at least: + - "input": the original query (str) + - If use_rag=True, "answer": the RAG answer (str) for that query + (The RAG pipeline’s output JSON is expected to have those keys.) + + Returns a dict with: + - "query" (original input) + - "rag_answer" (empty string if use_rag=False) + - "generated_subqueries": [list of sub-queries] + - "enhanced_answer": the final answer (text inside ) + - "additional_gaps": any remaining gaps (empty str if none) + - "sources": [ { "title": ..., "url": ... } ] (all unique sources gathered) + """ + user_query = record.get("input", "").strip() + rag_answer = record.get("answer", "").strip() if self.config.use_rag else "" + + # Decide if we have “useful” RAG info + useful = False + if self.config.use_rag and self.is_useful_rag_answer(rag_answer): + useful = True + + # Step 1: Optionally run a summary on the RAG answer if requested + if useful and self.config.rag_summary: + rag_answer = self.generate_summary(rag_answer, user_query) + + # Step 2: Generate sub-queries + subqueries = self.generate_subqueries(user_query, rag_answer if useful else None) + + # Step 3: For each sub-query, run DuckDuckGo search and collect sources + all_sources: List[Dict[str, str]] = [] + seen_urls: Set[str] = set() + for subq in subqueries: + results = self.duckduckgo_search(subq, max_results=self.config.max_searches) + for r in results: + if r["url"] not in seen_urls: + all_sources.append(r) + seen_urls.add(r["url"]) + + # Step 4: Integrate RAG answer (if any) or just original query with all web results + base_knowledge = rag_answer if useful else "" + combined = self.integrate_with_web(user_query, base_knowledge, all_sources) + + extracted = self.extract_enhanced_answer(combined) + final_answer = extracted["answer"] + final_gaps = extracted["additional_gaps"] + + return { + "query": user_query, + "rag_answer": rag_answer, + "generated_subqueries": subqueries, + "enhanced_answer": final_answer, + "additional_gaps": final_gaps, + "sources": all_sources, + } + + def run(self): + """ + 1) If use_rag=True: invoke run_rag(...) to produce RAG output JSON. + 2) Read self.config.input_file (either original queries or RAG output) as a JSON array. + 3) For each record, call process_record(...) and collect outputs. + 4) Write all outputs to self.config.output_file as JSON array. + """ + # If use_rag=True, run the RAG pipeline first + if self.config.use_rag: + if not self.config.rag_config_path: + raise ValueError("rag_config_path is required when use_rag=True.") + + print(f"Running RAG pipeline with config: {self.config.rag_config_path}") + # We expect run_rag(...) to return a List[Dict] if return_results=True. + self.rag_results = run_rag(self.config.rag_config_path, return_results=True) + print("RAG pipeline completed.") + + # The RAG pipeline also writes its own JSON to rag_config.mode_args.output_file. + # We assume that output path equals self.config.input_file at this point. + # (In practice, you should set self.config.input_file to that path in your YAML.) + else: + self.rag_results = None + + # Step 2: Load the JSON to process. It should be a list of records. + with open(self.config.input_file, "r", encoding="utf-8") as f: + data: List[Dict[str, Any]] = json.load(f) + + # Step 3: Process each record + all_outputs: List[Dict[str, Any]] = [] + for record in data: + out = self.process_record(record) + all_outputs.append(out) + + # Step 4: Save results + out_path = Path(self.config.output_file) + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "w", encoding="utf-8") as out_f: + json.dump(all_outputs, out_f, indent=2, ensure_ascii=False) + + print(f"\nResults saved to {out_path}") + + def generate_summary(self, rag_answer: str, query: str) -> str: + """ + Summarize the RAG answer (used when rag_summary=True). + """ + prompt = ( + "You have only the following context to answer the question—do not use any external knowledge.\n\n" + f"Question: {query}\n\n" + "Context:\n" + f"{rag_answer}\n\n" + "If the context contains the answer or useful information, respond with that information. " + "Answer:" + ) + + messages = [ + SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), + HumanMessage(content=prompt), + ] + response = self.llm.invoke(messages) + return self.extract_answer_after_prefix(response.content, "Answer:") From 22009aaead55812c2744c4fb6fad929863e71be2 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Mon, 9 Jun 2025 13:24:59 -0700 Subject: [PATCH 03/33] correct RAG import - LLM HS --- examples/websearchRAG/config.yaml | 12 ++++ src/mmore/run_websearch.py | 94 ++++++++++++----------------- src/mmore/websearchRAG/config.py | 27 ++++++++- src/mmore/websearchRAG/pipeline.py | 65 ++++++++++++++++---- src/mmore/websearchRAG/websearch.py | 68 +++++++++++++++++++++ 5 files changed, 199 insertions(+), 67 deletions(-) create mode 100644 examples/websearchRAG/config.yaml create mode 100644 src/mmore/websearchRAG/websearch.py diff --git a/examples/websearchRAG/config.yaml b/examples/websearchRAG/config.yaml new file mode 100644 index 00000000..e9c90c5d --- /dev/null +++ b/examples/websearchRAG/config.yaml @@ -0,0 +1,12 @@ +websearch: + use_rag: true + rag_config_path: examples/rag/config.yaml + use_summary: false + input_file: examples/rag/output.json + input_queries: examples/rag/queries.jsonl + output_file: examples/websearchRAG/enhanced_results_trial.json + n_loops: 2 + max_searches: 10 + llm_config: + temperature: 0.2 + llm: OpenMeditron/meditron3-8b diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index 3f28d085..38df1e42 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -2,16 +2,14 @@ import argparse import logging -import os import time +import torch from dataclasses import dataclass from typing import Any, Dict -import torch - -from .websearch.config import WebsearchConfig -from .websearch.pipeline import WebsearchPipeline from .utils import load_config +from .websearchRAG.config import WebsearchConfig +from .websearchRAG.pipeline import WebsearchPipeline WEBSRCH_EMOJI = "🌐" logger = logging.getLogger(__name__) @@ -21,73 +19,61 @@ datefmt="%Y-%m-%d %H:%M:%S", ) -# Disable certain CUDA optimizations for stability (same as in run_process.py) +# CUDA tweaks (as before) torch.backends.cuda.enable_mem_efficient_sdp(False) torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_math_sdp(True) @dataclass -class WebsearchInference: - """ - Inference configuration for Websearch (+ optional RAG) pipeline. - This should mirror the keys under your top-level 'websearch:' in the YAML. - """ - rag_config_path: str # path to RAG config (e.g. "../rag/config.yaml") - use_rag: bool # whether to run RAG first - rag_summary: bool # whether to summarize the RAG answer - input_file: str # path to JSON input (either queries or RAG output) - output_file: str # path to write final JSON results - n_subqueries: int # number of sub-queries to generate - max_searches: int # max DuckDuckGo hits per sub-query - llm_config: Dict[str, Any] # passed into LLMConfig (llm_name, max_new_tokens, temperature, etc.) +class WebsearchSection: + use_rag: bool + rag_config_path: str + use_summary: bool + input_file: str + input_queries: str + output_file: str + n_loops: int + max_searches: int + llm_config: Dict[str, Any] + - # If you add more fields under `websearch:` in your YAML, add them here. +@dataclass +class WebsearchAppConfig: + websearch: WebsearchSection def run_websearch(config_file: str): """ Run the Websearch (+ optional RAG) pipeline according to the YAML at config_file. - - 1) Load WebsearchInference via load_config. - 2) Convert that to a WebsearchConfig. - 3) Instantiate and run WebsearchPipeline. """ - click_msg = f"Websearch configuration file: {config_file}" - logger.info(click_msg) - - start_time = time.time() - - # 1) Load the YAML into our dataclass - cfg: WebsearchInference = load_config(config_file, WebsearchInference) - - # 2) Build WebsearchConfig from our dataclass - # WebsearchConfig.from_dict expects a dict with exactly the same keys, so: - web_cfg = WebsearchConfig.from_dict({ - "rag_config_path": cfg.rag_config_path, - "use_rag": cfg.use_rag, - "rag_summary": cfg.rag_summary, - "input_file": cfg.input_file, - "output_file": cfg.output_file, - "n_subqueries": cfg.n_subqueries, - "max_searches": cfg.max_searches, - "llm_config": cfg.llm_config, - }) - + logger.info(f"Websearch configuration file: {config_file}") + + # 1) Load and parse the YAML using the wrapper + app_cfg = load_config(config_file, WebsearchAppConfig) + ws = app_cfg.websearch + logger.info(f"Parsed Websearch section: {ws}") + + # 2) Map to the pipeline's config dict + web_cfg_dict = { + "use_rag": ws.use_rag, + "rag_config_path": ws.rag_config_path, + "rag_summary": ws.use_summary, + "input_file": ws.input_file, + "input_queries": ws.input_queries, + "output_file": ws.output_file, + "n_subqueries": ws.n_loops, + "max_searches": ws.max_searches, + "llm_config": ws.llm_config, + } + web_cfg = WebsearchConfig.from_dict(web_cfg_dict) logger.info(f"Using WebsearchConfig: {web_cfg}") # 3) Instantiate and run pipeline = WebsearchPipeline(config=web_cfg) - - pipeline_start = time.time() + start = time.time() pipeline.run() - pipeline_end = time.time() - - elapsed = pipeline_end - pipeline_start - logger.info(f"Websearch pipeline completed in {elapsed:.2f} seconds.") - - total_elapsed = time.time() - start_time - logger.info(f"Total execution time: {total_elapsed:.2f} seconds.") + logger.info(f"Websearch pipeline completed in {time.time() - start:.2f} seconds.") if __name__ == "__main__": diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index 3ef374c7..a0802385 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -2,8 +2,10 @@ from dataclasses import dataclass from typing import Any, Dict +from pathlib import Path +import yaml -from rag.llm import LLMConfig # Reuse the same LLMConfig as RAG +from ..rag.llm import LLMConfig # Reuse the same LLMConfig as RAG @dataclass @@ -55,3 +57,26 @@ def get_llm_config(self) -> LLMConfig: Convert the nested llm_config dict into an instance of rag.llm.LLMConfig. """ return LLMConfig(**self.llm_config) + + + def access_rag_config(self) -> Dict[str, Any]: + """ + Access and parse the RAG configuration file defined in `rag_config_path`. + + Returns: + A dictionary representing the RAG configuration. + """ + if not self.rag_config_path: + raise ValueError("The 'rag_config_path' is not defined.") + + # Resolve the full path to the RAG config file + rag_config_full_path = Path(self.rag_config_path) + + if not rag_config_full_path.exists(): + raise FileNotFoundError(f"RAG config file not found at {rag_config_full_path}") + + # Load the RAG configuration + with open(rag_config_full_path, "r") as file: + rag_config = yaml.safe_load(file) + + return rag_config \ No newline at end of file diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index 27819a0f..3c87920d 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -4,16 +4,16 @@ import re from pathlib import Path from typing import Dict, List, Any, Optional, Set +import logging from duckduckgo_search import DDGS from langchain_core.messages import HumanMessage, SystemMessage -from rag.run_rag import run_rag -from rag.utils import read_queries -from rag.llm import LLM +from ..run_rag import rag +from ..run_rag import read_queries +from ..rag.llm import LLM, LLMConfig from .config import WebsearchConfig - class WebsearchPipeline: """ Pipeline for optionally running RAG first, then generating sub-queries, @@ -22,12 +22,33 @@ class WebsearchPipeline: def __init__(self, config: WebsearchConfig): self.config = config - # Initialize the LLM using the same LLMConfig as RAG - self.llm = LLM.from_config(config.get_llm_config()) + + # Initialize the LLM + self.llm = self._initialize_llm() # Will store RAG results if we run RAG self.rag_results: Optional[List[Dict[str, Any]]] = None + + + def _initialize_llm(self) -> LLM: + """ + Initialize the LLM using the configuration. + + If RAG is enabled, load and use the LLM configuration from the RAG config file. + Otherwise, use the default configuration from WebsearchConfig. + """ + if self.config.use_rag: + rag_config = self.config.access_rag_config() + llm_config_dict = rag_config.get("rag", {}).get("llm", None) + if llm_config_dict is None: + raise ValueError("Missing 'llm' config under 'rag' key in the RAG configuration.") + llm_config = LLMConfig(**llm_config_dict) + return LLM.from_config(llm_config) + else: + llm_config = self.config.get_llm_config() + return LLM.from_config(llm_config) + @staticmethod def clean_llm_output(text: str) -> str: """Remove internal tokens or delimiters from the LLM output.""" @@ -124,7 +145,7 @@ def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]] for r in ddgs.text(query, max_results=max_results) ] except Exception as e: - print(f"Search error for \"{query}\": {e}") + logger.info(f"Search error for \"{query}\": {e}") return [] def integrate_with_web( @@ -262,34 +283,54 @@ def run(self): if not self.config.rag_config_path: raise ValueError("rag_config_path is required when use_rag=True.") - print(f"Running RAG pipeline with config: {self.config.rag_config_path}") + logger.info(f"Running RAG pipeline with config: {self.config.rag_config_path}") # We expect run_rag(...) to return a List[Dict] if return_results=True. - self.rag_results = run_rag(self.config.rag_config_path, return_results=True) - print("RAG pipeline completed.") + rag(self.config.rag_config_path) + logger.info("RAG pipeline completed.") + + rag_cfg = self.config.access_rag_config() + output_file = rag_cfg["mode_args"]["output_file"] + self.config.input_file = output_file + + logger.info("Updated input file for the pipeline") + + #resume RAG also + # The RAG pipeline also writes its own JSON to rag_config.mode_args.output_file. # We assume that output path equals self.config.input_file at this point. # (In practice, you should set self.config.input_file to that path in your YAML.) else: self.rag_results = None + #no rag --> generate queries + + + + #Step 2: + + # Step 2: Load the JSON to process. It should be a list of records. with open(self.config.input_file, "r", encoding="utf-8") as f: data: List[Dict[str, Any]] = json.load(f) # Step 3: Process each record - all_outputs: List[Dict[str, Any]] = [] + all_outputs: List[Dict[str, Any]] = [] for record in data: out = self.process_record(record) all_outputs.append(out) + + # Step 4: Save results out_path = Path(self.config.output_file) out_path.parent.mkdir(parents=True, exist_ok=True) with open(out_path, "w", encoding="utf-8") as out_f: json.dump(all_outputs, out_f, indent=2, ensure_ascii=False) - print(f"\nResults saved to {out_path}") + logger.info(f"\nResults saved to {out_path}") + + def generate_summary(self, rag_answer: str, query: str) -> str: """ diff --git a/src/mmore/websearchRAG/websearch.py b/src/mmore/websearchRAG/websearch.py new file mode 100644 index 00000000..6c6667ce --- /dev/null +++ b/src/mmore/websearchRAG/websearch.py @@ -0,0 +1,68 @@ +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import List, Dict + +class WebsearchOnly: + """Class dedicated to performing web searches and validating their usefulness.""" + + def __init__(self, region: str = 'wt-wt', max_results: int = 10): + """Initialize the WebsearchOnly class with search parameters.""" + self.wrapper = DuckDuckGoSearchAPIWrapper(region=region, max_results=max_results) + + def websearch_pipeline(self, query: str) -> Dict[str, str]: + """Perform a single web search.""" + search = DuckDuckGoSearchResults(api_wrapper=self.wrapper) + web_output = search.run(query) + return web_output + + + + def multiple_queries(self, original_query: str, list_of_queries: List[str], keep_intact: bool) -> List[Dict[str, str]]: + """Perform multiple web searches in parallel for a list of queries.""" + results = [] + with ThreadPoolExecutor() as executor: + future_to_query = {executor.submit(self.websearch_pipeline, query): query for query in list_of_queries} + for future in as_completed(future_to_query): + query = future_to_query[future] + try: + result = future.result() + if keep_intact: + to_keep = self.check_source_of_web_search(original_query, result) + if to_keep: + results.append({'query': query, 'result': result, 'summary' : None}) + else: + summary = self.resume_web_search(original_query, result) + results.append({'query': query, 'result': result, 'summary': summary}) + except Exception as e: + results.append({'query': query, 'error': str(e)}) + + return results + + + + def check_source_of_web_search(self, query: str, web_output: str) -> bool: + """Call LLM to determine if the current web output is useful based on the original query.""" + llm = LLM() # TODO: Implement LLM + prompt = ( + f"Original Query: '{query}'\n" + f"Web Output: '{web_output}'\n" + "Is the web output useful for the original query? Answer with 'True' or 'False'." + ) + response = llm.invoke(prompt) + return response.strip().lower() == 'true' + + def resume_web_search(self, query: str, web_output: str) -> str: + """Call LLM to resume the current web output based on the original query, return a summary of the web search and the source.""" + llm = LLM() # TODO: Implement LLM + prompt = ( + f"Original Query: '{query}'\n" + f"Web content: '{web_output}'\n" + "Based on the original query and the web content, can you provide a response to the original query?" + ) + response = llm.invoke(prompt) + return response.strip() ### RAJOUTER LA SOURCE + +# Example usage: +# websearch = WebsearchOnly() +# results = websearch.multiple_queries("original_query", ["query1", "query2", "query3"], keep_intact=True) +# for res in results: +# print(res) From cbb7c8c64102b1bde7368683f8b78ff3dfc991ac Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Tue, 10 Jun 2025 14:11:39 -0700 Subject: [PATCH 04/33] extract new subqueries --- src/mmore/run_websearch.py | 19 ++- src/mmore/websearchRAG/logging_config.py | 24 +++ src/mmore/websearchRAG/pipeline.py | 181 +++++++++++++---------- 3 files changed, 135 insertions(+), 89 deletions(-) create mode 100644 src/mmore/websearchRAG/logging_config.py diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index 38df1e42..fb248174 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -7,17 +7,22 @@ from dataclasses import dataclass from typing import Any, Dict + +from .websearchRAG.logging_config import logger # Import the shared logger + from .utils import load_config from .websearchRAG.config import WebsearchConfig from .websearchRAG.pipeline import WebsearchPipeline -WEBSRCH_EMOJI = "🌐" -logger = logging.getLogger(__name__) -logging.basicConfig( - format=f"[WebSearch {WEBSRCH_EMOJI} -- %(asctime)s] %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) + + +# WEBSRCH_EMOJI = "🌐" +# logger = logging.getLogger(__name__) +# logging.basicConfig( +# format=f"[WebSearch {WEBSRCH_EMOJI} -- %(asctime)s] %(message)s", +# level=logging.INFO, +# datefmt="%Y-%m-%d %H:%M:%S", +# ) # CUDA tweaks (as before) torch.backends.cuda.enable_mem_efficient_sdp(False) diff --git a/src/mmore/websearchRAG/logging_config.py b/src/mmore/websearchRAG/logging_config.py new file mode 100644 index 00000000..ac9f7f0c --- /dev/null +++ b/src/mmore/websearchRAG/logging_config.py @@ -0,0 +1,24 @@ +import logging + +# Create or get the logger + +WEBSRCH_EMOJI = "🌐" +logger = logging.getLogger("WEBSEARCHRAG") +logging.basicConfig( + format=f"[WebSearch {WEBSRCH_EMOJI} -- %(asctime)s] %(message)s", + level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S", +) + +# Prevent multiple handlers if the logger is configured multiple times +if not logger.handlers: + # Create a file handler to log to a file + file_handler = logging.FileHandler('shared_log_file.log') + file_handler.setLevel(logging.DEBUG) + + # Define log format + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + + # Add file handler to logger + logger.addHandler(file_handler) \ No newline at end of file diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index 3c87920d..0966be3b 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -1,19 +1,31 @@ # mmore/websearch/pipeline.py +from .logging_config import logger + import json import re from pathlib import Path from typing import Dict, List, Any, Optional, Set import logging +import json + from duckduckgo_search import DDGS from langchain_core.messages import HumanMessage, SystemMessage + + from ..run_rag import rag from ..run_rag import read_queries from ..rag.llm import LLM, LLMConfig from .config import WebsearchConfig + + +#python -m mmore websearch --config-file examples/websearchRAG/config.yaml + + + class WebsearchPipeline: """ Pipeline for optionally running RAG first, then generating sub-queries, @@ -33,10 +45,7 @@ def __init__(self, config: WebsearchConfig): def _initialize_llm(self) -> LLM: """ - Initialize the LLM using the configuration. - - If RAG is enabled, load and use the LLM configuration from the RAG config file. - Otherwise, use the default configuration from WebsearchConfig. + Initialize the LLM using the updated configuration for the fine-tuned multilingual model. """ if self.config.use_rag: rag_config = self.config.access_rag_config() @@ -47,13 +56,32 @@ def _initialize_llm(self) -> LLM: return LLM.from_config(llm_config) else: llm_config = self.config.get_llm_config() + # Ensure the model is configured for multilingual use + llm_config.language_support = "multilingual" + llm_config.model_name = "fine_tuned_multilingual_model" return LLM.from_config(llm_config) - @staticmethod - def clean_llm_output(text: str) -> str: - """Remove internal tokens or delimiters from the LLM output.""" - return re.sub(r'<\|.*?\|>', '', text).strip() + def clean_llm_output(self, content): + # Define the delimiter after which the subqueries are located + delimiter = "---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + + # Split the content based on the delimiter + if delimiter in content: + + subquery_section = content.split(delimiter, 1)[-1] + # Use regex to extract lines matching the subquery format + subquery_section = subquery_section.lower().strip() + subqueries = re.findall(r"subquery \d+: (.*)", subquery_section.strip()) + print(f"subquery_section:", subquery_section) + print('subquery outputs', subqueries) + return subqueries + else: + return [] + + + + @staticmethod def extract_answer_after_prefix(raw: str, prefix: str) -> str: """ @@ -85,53 +113,58 @@ def generate_subqueries( rag_answer: Optional[str] = None ) -> List[str]: """ - Ask the LLM to produce exactly self.config.n_subqueries follow‐up search queries. - - If rag_answer is provided and “useful,” we prompt for subqueries that complement it. - - If rag_answer is None or not useful, we prompt for subqueries that answer the original query fully. - We expect the LLM to output one sub-query per line. + Generate concise search subqueries using the fine-tuned multilingual model. """ n = self.config.n_subqueries + instruction = f"You have the question and partial answer below:\nQuestion: {original_query}\n\n" + task = ( + f"Partial RAG Answer: {rag_answer}\n\n" + f"Generate {n}-independant subqueries to refine the answer based on the original query. Each subquery must be concise and ≤30 words.\n" + f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" + "---------------------------" + ) - if rag_answer and self.is_useful_rag_answer(rag_answer): - # Build a prompt that asks for sub-queries to complement the existing RAG answer - prompt = ( - f"Original Query: \"{original_query}\"\n" - f"Current RAG Answer: \"{rag_answer}\"\n\n" - f"Generate {n} concise search queries (each ≤30 words) that would help update or fill gaps " - "in the RAG answer. Provide exactly one query per line. " - "Do not provide any additional commentary." - ) - else: - # No useful RAG answer: ask for sub-queries that collectively answer the original question - prompt = ( - f"Original Query: \"{original_query}\"\n\n" - f"No RAG knowledge is available. Generate {n} concise search queries " - "(each ≤30 words) that, when searched, would collectively answer the original query. " - "Provide exactly one query per line. Do not provide any additional commentary." - ) + prompt = instruction + task messages = [ - SystemMessage(content="You are a helpful assistant that generates search queries."), + SystemMessage(content="You are an assistant specializing in generating search queries."), HumanMessage(content=prompt), ] + response = self.llm.invoke(messages) - raw = self.clean_llm_output(response.content) + print("######") + print("LLM response: ", response.content) + print("######") + print("Clean response: ", self.clean_llm_output(response.content)) + print("--------------------") + return self.clean_llm_output(response.content) + + def integrate_with_web( + self, + original_query: str, + base_knowledge: str, + all_results: List[Dict[str, str]] + ) -> str: + """ + Integrate web results and base knowledge into a comprehensive answer. + """ + collated_sources = "\n".join([f"Source: {r['title']}, URL: {r['url']}" for r in all_results]) + prompt = ( + f"Original Query: {original_query}\n\n" + f"Base Knowledge: {base_knowledge}\n\n" + f"Web Results:\n{collated_sources}\n\n" + "Combine the base knowledge and web results into a detailed answer. " + "Enclose your response within ... tags." + ) - # Split lines, discard empty lines, strip numbering if present - lines = [line.strip() for line in raw.splitlines() if line.strip()] - subqueries: List[str] = [] - for line in lines: - # Remove leading numbering like “1. …” or “(1) …” - cleaned = re.sub(r'^\s*\d+[\).\s]+\s*', '', line) - subqueries.append(cleaned) - if len(subqueries) >= n: - break + messages = [ + SystemMessage(content="You are a multilingual research assistant."), + HumanMessage(content=prompt), + ] - # If LLM gave fewer lines, we can pad with the original query - while len(subqueries) < n: - subqueries.append(original_query) + response = self.llm.invoke(messages) + return self.clean_llm_output(response.content) - return subqueries @staticmethod def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: @@ -148,6 +181,7 @@ def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]] logger.info(f"Search error for \"{query}\": {e}") return [] + def integrate_with_web( self, original_query: str, @@ -155,53 +189,33 @@ def integrate_with_web( all_results: List[Dict[str, str]] ) -> str: """ - Given: - - original_query (str), - - base_knowledge (either a non‐empty RAG answer or just an empty string), - - all_results: list of {title, url} from all sub‐queries, - produce a final “enhanced answer” via the LLM. - We’ll ask the LLM to weave base_knowledge + new web info into a single answer. - We’ll ask it to output the answer enclosed in tags. + Integrate web results and base knowledge into a comprehensive answer. """ - # Collate sources into a single text blob. (We omit “body” because DuckDuckGo API - # only supplies title & URL. You could extend to fetch page snippets if you like.) - collated = "\n".join([f"Source: {r['title']}, URL: {r['url']}" for r in all_results]) - - if base_knowledge: - prompt = ( - f"Original Query: \"{original_query}\"\n\n" - f"Current Knowledge (RAG): \"{base_knowledge}\"\n\n" - f"New Information from Web:\n{collated}\n\n" - "Produce a comprehensive answer that integrates the RAG knowledge with the new web information. " - "Your final answer must be enclosed in tags. " - "If there are any remaining gaps, list them under “ADDITIONAL GAPS” after the tags." - ) - else: - prompt = ( - f"Original Query: \"{original_query}\"\n\n" - f"No prior RAG knowledge. New Information from Web:\n{collated}\n\n" - "Provide a comprehensive answer that directly addresses the original query, " - "based solely on these web results. Your final answer must be enclosed in tags. " - "If there are any remaining gaps, list them under “ADDITIONAL GAPS” after the tags." - ) + collated_sources = "\n".join([f"Source: {r['title']}, URL: {r['url']}" for r in all_results]) + prompt = ( + f"Original Query: {original_query}\n\n" + f"Base Knowledge: {base_knowledge}\n\n" + f"Web Results:\n{collated_sources}\n\n" + "Combine the base knowledge and web results into a detailed answer. " + "Enclose your response within ... tags." + ) messages = [ - SystemMessage(content="You are a research analyst that writes detailed answers."), + SystemMessage(content="You are a multilingual research assistant."), HumanMessage(content=prompt), ] + response = self.llm.invoke(messages) - return self.clean_llm_output(response.content) @staticmethod - def extract_enhanced_answer(text: str) -> Dict[str, Any]: + def extract_enhanced_answer(text: Optional[str]) -> Dict[str, Any]: """ - Extract the content between , and any “ADDITIONAL GAPS” afterwards. - Returns a dict: - { - "answer": , - "additional_gaps": - } + Extract the content between ..., and any “ADDITIONAL GAPS” afterwards. + Handles NoneType input gracefully. """ + if not text or not isinstance(text, str): + return {"answer": "", "additional_gaps": ""} + # 1) Extract between tags match = re.search(r"\s*(.*?)\s*", text, re.DOTALL | re.IGNORECASE) answer = match.group(1).strip() if match else text.strip() @@ -214,6 +228,7 @@ def extract_enhanced_answer(text: str) -> Dict[str, Any]: return {"answer": answer, "additional_gaps": gaps} + def process_record(self, record: Dict[str, Any]) -> Dict[str, Any]: """ Process a single record (dict). The record must contain at least: @@ -248,12 +263,14 @@ def process_record(self, record: Dict[str, Any]) -> Dict[str, Any]: all_sources: List[Dict[str, str]] = [] seen_urls: Set[str] = set() for subq in subqueries: + logger.debug(f"subquery: {subq}") results = self.duckduckgo_search(subq, max_results=self.config.max_searches) for r in results: if r["url"] not in seen_urls: all_sources.append(r) seen_urls.add(r["url"]) + # Step 4: Integrate RAG answer (if any) or just original query with all web results base_knowledge = rag_answer if useful else "" combined = self.integrate_with_web(user_query, base_knowledge, all_sources) @@ -328,7 +345,7 @@ def run(self): with open(out_path, "w", encoding="utf-8") as out_f: json.dump(all_outputs, out_f, indent=2, ensure_ascii=False) - logger.info(f"\nResults saved to {out_path}") + print(f"\nResults saved to {out_path}") From 43eb8cd94e510afb54d99d745ded1dbef58874cc Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Wed, 11 Jun 2025 04:33:31 -0700 Subject: [PATCH 05/33] query extraction solved --- src/mmore/websearchRAG/pipeline.py | 52 +++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index 0966be3b..57039998 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -79,6 +79,20 @@ def clean_llm_output(self, content): else: return [] + def clean_llm_summary(self, content): + # Define the delimiter after which the subqueries are located + delimiter = "---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + + # Split the content based on the delimiter + if delimiter in content: + + subquery_section = content.split(delimiter, 1)[-1] + # Use regex to extract lines matching the subquery format + subquery_section = subquery_section.lower().strip() + return subquery_section + else: + return [] + @@ -117,12 +131,20 @@ def generate_subqueries( """ n = self.config.n_subqueries instruction = f"You have the question and partial answer below:\nQuestion: {original_query}\n\n" - task = ( + if rag_answer is None: + task = ( + f"Generate {n}-independant subqueries based the original query, in order to generate the most complete research. Each subquery must be concise and ≤30 words.\n" + f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" + "---------------------------" + ) + else: + task = ( f"Partial RAG Answer: {rag_answer}\n\n" f"Generate {n}-independant subqueries to refine the answer based on the original query. Each subquery must be concise and ≤30 words.\n" f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" "---------------------------" - ) + ) + prompt = instruction + task @@ -249,15 +271,17 @@ def process_record(self, record: Dict[str, Any]) -> Dict[str, Any]: # Decide if we have “useful” RAG info useful = False - if self.config.use_rag and self.is_useful_rag_answer(rag_answer): - useful = True - # Step 1: Optionally run a summary on the RAG answer if requested - if useful and self.config.rag_summary: - rag_answer = self.generate_summary(rag_answer, user_query) + # Step 1: Run a summary on the RAG answer if we want to use the rag else empty + if self.config.use_rag: + rag_summary = self.generate_summary(rag_answer, user_query) + else: + rag_summary = None + + print("Rag Summary", rag_summary) - # Step 2: Generate sub-queries - subqueries = self.generate_subqueries(user_query, rag_answer if useful else None) + # Step 2: Generate sub-queries - either RAG raw output or RAG summary + subqueries = self.generate_subqueries(user_query, rag_summary) # Step 3: For each sub-query, run DuckDuckGo search and collect sources all_sources: List[Dict[str, str]] = [] @@ -282,6 +306,7 @@ def process_record(self, record: Dict[str, Any]) -> Dict[str, Any]: return { "query": user_query, "rag_answer": rag_answer, + "rag_summary": rag_summary, "generated_subqueries": subqueries, "enhanced_answer": final_answer, "additional_gaps": final_gaps, @@ -354,12 +379,15 @@ def generate_summary(self, rag_answer: str, query: str) -> str: Summarize the RAG answer (used when rag_summary=True). """ prompt = ( - "You have only the following context to answer the question—do not use any external knowledge.\n\n" + "You have only the following context to answer the question, do not use any external knowledge.\n\n" f"Question: {query}\n\n" "Context:\n" f"{rag_answer}\n\n" - "If the context contains the answer or useful information, respond with that information. " + "If the context contains the answer or useful information, respond with that information. \n" + "If no useful informations are, answer: no useful informations" "Answer:" + "---------------------------" + ) messages = [ @@ -367,4 +395,4 @@ def generate_summary(self, rag_answer: str, query: str) -> str: HumanMessage(content=prompt), ] response = self.llm.invoke(messages) - return self.extract_answer_after_prefix(response.content, "Answer:") + return self.clean_llm_summary(response.content) From 7d0e86aa2ba2201d54250ef34a53bb57659681b1 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Sat, 14 Jun 2025 02:10:27 -0700 Subject: [PATCH 06/33] use_rag(true), use_summary(wrong): working --- src/mmore/run_websearch.py | 4 +- src/mmore/websearchRAG/config.py | 4 +- src/mmore/websearchRAG/pipeline.py | 429 +++++++++-------------------- 3 files changed, 141 insertions(+), 296 deletions(-) diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index fb248174..e8408540 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -35,6 +35,7 @@ class WebsearchSection: use_rag: bool rag_config_path: str use_summary: bool + n_subqueries : int input_file: str input_queries: str output_file: str @@ -67,7 +68,8 @@ def run_websearch(config_file: str): "input_file": ws.input_file, "input_queries": ws.input_queries, "output_file": ws.output_file, - "n_subqueries": ws.n_loops, + "n_subqueries": ws.n_subqueries, + "n_loops": ws.n_loops, "max_searches": ws.max_searches, "llm_config": ws.llm_config, } diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index a0802385..aec81589 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -30,13 +30,14 @@ class WebsearchConfig: input_file: str output_file: str n_subqueries: int + n_loops : int max_searches: int llm_config: Dict[str, Any] @staticmethod def from_dict(d: Dict[str, Any]) -> "WebsearchConfig": # Validate required keys - required = ["use_rag", "rag_summary", "input_file", "output_file", "n_subqueries", "max_searches", "llm_config"] + required = ["use_rag", "rag_summary", "input_file", "output_file", "n_loops", "n_subqueries", "max_searches", "llm_config"] for key in required: if key not in d: raise ValueError(f"Missing '{key}' in WebsearchConfig.") @@ -47,6 +48,7 @@ def from_dict(d: Dict[str, Any]) -> "WebsearchConfig": rag_summary=d["rag_summary"], input_file=d["input_file"], output_file=d["output_file"], + n_loops=d["n_loops"], n_subqueries=int(d["n_subqueries"]), max_searches=int(d["max_searches"]), llm_config=d["llm_config"], diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index 57039998..f833894b 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -1,70 +1,93 @@ -# mmore/websearch/pipeline.py from .logging_config import logger - import json import re from pathlib import Path from typing import Dict, List, Any, Optional, Set import logging -import json - +import time from duckduckgo_search import DDGS from langchain_core.messages import HumanMessage, SystemMessage - - from ..run_rag import rag from ..run_rag import read_queries from ..rag.llm import LLM, LLMConfig from .config import WebsearchConfig - - #python -m mmore websearch --config-file examples/websearchRAG/config.yaml - - class WebsearchPipeline: """ - Pipeline for optionally running RAG first, then generating sub-queries, - performing web searches, and finally integrating everything via the LLM. + Pipeline for running RAG and iterative websearch loops, + integrating retrieved knowledge into enhanced answers. """ def __init__(self, config: WebsearchConfig): self.config = config - - # Initialize the LLM self.llm = self._initialize_llm() - - # Will store RAG results if we run RAG self.rag_results: Optional[List[Dict[str, Any]]] = None - - def _initialize_llm(self) -> LLM: - """ - Initialize the LLM using the updated configuration for the fine-tuned multilingual model. - """ if self.config.use_rag: - rag_config = self.config.access_rag_config() - llm_config_dict = rag_config.get("rag", {}).get("llm", None) - if llm_config_dict is None: - raise ValueError("Missing 'llm' config under 'rag' key in the RAG configuration.") - llm_config = LLMConfig(**llm_config_dict) - return LLM.from_config(llm_config) + rag_cfg = self.config.access_rag_config() + llm_conf = rag_cfg.get("rag", {}).get("llm") + if llm_conf is None: + raise ValueError("Missing 'llm' config under 'rag' in RAG configuration.") + return LLM.from_config(LLMConfig(**llm_conf)) else: - llm_config = self.config.get_llm_config() - # Ensure the model is configured for multilingual use - llm_config.language_support = "multilingual" - llm_config.model_name = "fine_tuned_multilingual_model" - return LLM.from_config(llm_config) + base_conf = self.config.get_llm_config() + return LLM.from_config(base_conf) + + def generate_summary(self, rag_answer: str, query: str) -> str: + """ + Summarize the RAG answer (used when rag_summary=True). + """ + prompt = ( + "You have only the following context to answer the question, do not use any external knowledge.\n\n" + f"Question: {query}\n\n" + "Context:\n" + f"{rag_answer}\n\n" + "If the context contains the answer or useful information, respond with that information. \n" + "If no useful informations are, answer: no useful informations" + "Answer:" + "---------------------------" + + ) + + messages = [ + SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), + HumanMessage(content=prompt), + ] + response = self.llm.invoke(messages) + print("##SUMMARY CLEAN##") + print(self._clean_section(response.content)) + return self._clean_section(response.content) + + def _clean_section(self, content: str) -> str: + delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + subquery_section = content.split(delimiter)[-1].strip() + subquery_section = subquery_section.lower().strip() + print("##Current Summary##") + print(subquery_section) + print("##") + return subquery_section + + + + @staticmethod + def is_useful(text: str) -> bool: + t = text.strip().lower() + if not t or t.startswith("i don't know") or t.startswith("no"): + return False + return True + + def clean_llm_output(self, content): # Define the delimiter after which the subqueries are located - delimiter = "---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" # Split the content based on the delimiter if delimiter in content: @@ -73,53 +96,12 @@ def clean_llm_output(self, content): # Use regex to extract lines matching the subquery format subquery_section = subquery_section.lower().strip() subqueries = re.findall(r"subquery \d+: (.*)", subquery_section.strip()) - print(f"subquery_section:", subquery_section) - print('subquery outputs', subqueries) + #print(f"subquery_section:", subquery_section) + #print('subquery outputs', subqueries) return subqueries else: return [] - def clean_llm_summary(self, content): - # Define the delimiter after which the subqueries are located - delimiter = "---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>" - - # Split the content based on the delimiter - if delimiter in content: - - subquery_section = content.split(delimiter, 1)[-1] - # Use regex to extract lines matching the subquery format - subquery_section = subquery_section.lower().strip() - return subquery_section - else: - return [] - - - - - @staticmethod - def extract_answer_after_prefix(raw: str, prefix: str) -> str: - """ - Extract everything after a given prefix (case-insensitive). - If prefix not found, return raw. - """ - raw = WebsearchPipeline.clean_llm_output(raw) - pattern = re.compile(re.escape(prefix) + r"(.*)", re.IGNORECASE | re.DOTALL) - match = pattern.search(raw) - return match.group(1).strip() if match else raw - - @staticmethod - def is_useful_rag_answer(rag_answer: str) -> bool: - """ - Decide whether the RAG answer is “useful.” We treat nonempty answers - that do not just say “I don’t know” as useful. - """ - ans = rag_answer.strip().lower() - if not ans: - return False - # If it begins with a phrase like “i don’t know” or “no answer” etc. - if ans.startswith("i don’t know") or ans.startswith("i dont know") or ans.startswith("no"): - return False - return True def generate_subqueries( self, @@ -135,18 +117,16 @@ def generate_subqueries( task = ( f"Generate {n}-independant subqueries based the original query, in order to generate the most complete research. Each subquery must be concise and ≤30 words.\n" f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" - "---------------------------" + ) else: task = ( f"Partial RAG Answer: {rag_answer}\n\n" f"Generate {n}-independant subqueries to refine the answer based on the original query. Each subquery must be concise and ≤30 words.\n" f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" - "---------------------------" + f"---ANSWER ---" ) - - prompt = instruction + task messages = [ SystemMessage(content="You are an assistant specializing in generating search queries."), @@ -154,245 +134,106 @@ def generate_subqueries( ] response = self.llm.invoke(messages) - print("######") - print("LLM response: ", response.content) + #print("######") + #print("LLM response: ", response.content) print("######") print("Clean response: ", self.clean_llm_output(response.content)) print("--------------------") return self.clean_llm_output(response.content) - def integrate_with_web( - self, - original_query: str, - base_knowledge: str, - all_results: List[Dict[str, str]] - ) -> str: - """ - Integrate web results and base knowledge into a comprehensive answer. - """ - collated_sources = "\n".join([f"Source: {r['title']}, URL: {r['url']}" for r in all_results]) - prompt = ( - f"Original Query: {original_query}\n\n" - f"Base Knowledge: {base_knowledge}\n\n" - f"Web Results:\n{collated_sources}\n\n" - "Combine the base knowledge and web results into a detailed answer. " - "Enclose your response within ... tags." - ) - messages = [ - SystemMessage(content="You are a multilingual research assistant."), - HumanMessage(content=prompt), - ] - response = self.llm.invoke(messages) - return self.clean_llm_output(response.content) + @staticmethod def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: - """ - Perform a DuckDuckGo search and return a list of {'title', 'url'} dicts. - """ + time.sleep(1) try: with DDGS() as ddgs: - return [ - {"title": r.get("title", ""), "url": r.get("href", "")} - for r in ddgs.text(query, max_results=max_results) - ] + print("query:", query) + results = ddgs.text(query, max_results=max_results) + return [{"title": r.get("title",""), "url": r.get("href","")} for r in results] except Exception as e: - logger.info(f"Search error for \"{query}\": {e}") + logger.error(f"DuckDuckGo error: {e}") return [] - def integrate_with_web( + + def integrate_with_llm( self, - original_query: str, - base_knowledge: str, - all_results: List[Dict[str, str]] - ) -> str: - """ - Integrate web results and base knowledge into a comprehensive answer. - """ - collated_sources = "\n".join([f"Source: {r['title']}, URL: {r['url']}" for r in all_results]) + original: str, + rag_doc: str, + web_snippets: List[str] + ) -> Dict[str, str]: + # Build prompt for short & detailed answer + sources = "\n".join(web_snippets) prompt = ( - f"Original Query: {original_query}\n\n" - f"Base Knowledge: {base_knowledge}\n\n" - f"Web Results:\n{collated_sources}\n\n" - "Combine the base knowledge and web results into a detailed answer. " - "Enclose your response within ... tags." + f"Original Query: {original}\n" + f"RAG Document: {rag_doc}\n" + f"Web Snippets:\n{sources}\n" + "Provide output as: short answer: ..., detailed answer: ..." ) - - messages = [ - SystemMessage(content="You are a multilingual research assistant."), - HumanMessage(content=prompt), - ] - - response = self.llm.invoke(messages) - - @staticmethod - def extract_enhanced_answer(text: Optional[str]) -> Dict[str, Any]: - """ - Extract the content between ..., and any “ADDITIONAL GAPS” afterwards. - Handles NoneType input gracefully. - """ - if not text or not isinstance(text, str): - return {"answer": "", "additional_gaps": ""} - - # 1) Extract between tags - match = re.search(r"\s*(.*?)\s*", text, re.DOTALL | re.IGNORECASE) - answer = match.group(1).strip() if match else text.strip() - - # 2) Extract “ADDITIONAL GAPS:” block if present - gaps = "" - gap_match = re.search(r"ADDITIONAL GAPS:\s*(.*)$", text, re.DOTALL | re.IGNORECASE) - if gap_match: - gaps = gap_match.group(1).strip() - - return {"answer": answer, "additional_gaps": gaps} - - - def process_record(self, record: Dict[str, Any]) -> Dict[str, Any]: - """ - Process a single record (dict). The record must contain at least: - - "input": the original query (str) - - If use_rag=True, "answer": the RAG answer (str) for that query - (The RAG pipeline’s output JSON is expected to have those keys.) - - Returns a dict with: - - "query" (original input) - - "rag_answer" (empty string if use_rag=False) - - "generated_subqueries": [list of sub-queries] - - "enhanced_answer": the final answer (text inside ) - - "additional_gaps": any remaining gaps (empty str if none) - - "sources": [ { "title": ..., "url": ... } ] (all unique sources gathered) - """ - user_query = record.get("input", "").strip() - rag_answer = record.get("answer", "").strip() if self.config.use_rag else "" - - # Decide if we have “useful” RAG info - useful = False - - # Step 1: Run a summary on the RAG answer if we want to use the rag else empty - if self.config.use_rag: - rag_summary = self.generate_summary(rag_answer, user_query) - else: - rag_summary = None - - print("Rag Summary", rag_summary) - - # Step 2: Generate sub-queries - either RAG raw output or RAG summary - subqueries = self.generate_subqueries(user_query, rag_summary) - - # Step 3: For each sub-query, run DuckDuckGo search and collect sources - all_sources: List[Dict[str, str]] = [] - seen_urls: Set[str] = set() - for subq in subqueries: - logger.debug(f"subquery: {subq}") - results = self.duckduckgo_search(subq, max_results=self.config.max_searches) - for r in results: - if r["url"] not in seen_urls: - all_sources.append(r) - seen_urls.add(r["url"]) - - - # Step 4: Integrate RAG answer (if any) or just original query with all web results - base_knowledge = rag_answer if useful else "" - combined = self.integrate_with_web(user_query, base_knowledge, all_sources) - - extracted = self.extract_enhanced_answer(combined) - final_answer = extracted["answer"] - final_gaps = extracted["additional_gaps"] - + msgs = [SystemMessage(content="You are a research assistant."), HumanMessage(content=prompt)] + resp = self.llm.invoke(msgs) + # parse + sa = re.search(r"short answer:\s*(.*?)\s*(?=detailed answer:)", resp.content, flags=re.IGNORECASE|re.DOTALL) + da = re.search(r"detailed answer:\s*(.*)", resp.content, flags=re.IGNORECASE|re.DOTALL) + return {"short": sa.group(1).strip() if sa else "", "detailed": da.group(1).strip() if da else ""} + + def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: + qr = rec.get("input","").strip() + # RAG step + rag_ans = rec.get("answer","") if self.config.use_rag else "" + rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None + # iterative loops + all_sources: Set[str] = set() + current_context = rag_summary or "" + final_short, final_detailed = "", "" + for loop in range(self.config.n_loops): + # generate subqueries + subs = self.generate_subqueries(qr, current_context) + snippets, urls = [], [] + for sq in subs: + print("subquery: ", sq) + res = self.duckduckgo_search(sq, max_results=self.config.max_searches) + for r in res: + if r["url"] not in all_sources: + all_sources.add(r["url"]) + # For simplicity assume snippet = title + snippets.append(f"{r['title']} ({r['url']})") + # integrate with LLM + out = self.integrate_with_llm(qr, current_context, snippets) + final_short, final_detailed = out["short"], out["detailed"] + # next context = detailed answer + current_context = final_detailed return { - "query": user_query, - "rag_answer": rag_answer, + "query": qr, "rag_summary": rag_summary, - "generated_subqueries": subqueries, - "enhanced_answer": final_answer, - "additional_gaps": final_gaps, - "sources": all_sources, + "short_answer": final_short, + "detailed_answer": final_detailed, + "sources": list(all_sources) } def run(self): - """ - 1) If use_rag=True: invoke run_rag(...) to produce RAG output JSON. - 2) Read self.config.input_file (either original queries or RAG output) as a JSON array. - 3) For each record, call process_record(...) and collect outputs. - 4) Write all outputs to self.config.output_file as JSON array. - """ - # If use_rag=True, run the RAG pipeline first + # RAG pipeline if self.config.use_rag: if not self.config.rag_config_path: - raise ValueError("rag_config_path is required when use_rag=True.") - - logger.info(f"Running RAG pipeline with config: {self.config.rag_config_path}") - # We expect run_rag(...) to return a List[Dict] if return_results=True. + raise ValueError("rag_config_path required when use_rag=True") + logger.info("Running RAG pipeline...") rag(self.config.rag_config_path) - logger.info("RAG pipeline completed.") - - rag_cfg = self.config.access_rag_config() - output_file = rag_cfg["mode_args"]["output_file"] - self.config.input_file = output_file - - logger.info("Updated input file for the pipeline") - - #resume RAG also - - - # The RAG pipeline also writes its own JSON to rag_config.mode_args.output_file. - # We assume that output path equals self.config.input_file at this point. - # (In practice, you should set self.config.input_file to that path in your YAML.) - else: - self.rag_results = None - #no rag --> generate queries - - - - #Step 2: - - - - # Step 2: Load the JSON to process. It should be a list of records. - with open(self.config.input_file, "r", encoding="utf-8") as f: - data: List[Dict[str, Any]] = json.load(f) - - # Step 3: Process each record - all_outputs: List[Dict[str, Any]] = [] - for record in data: - out = self.process_record(record) - all_outputs.append(out) - - - - # Step 4: Save results - out_path = Path(self.config.output_file) - out_path.parent.mkdir(parents=True, exist_ok=True) - with open(out_path, "w", encoding="utf-8") as out_f: - json.dump(all_outputs, out_f, indent=2, ensure_ascii=False) - - print(f"\nResults saved to {out_path}") - - - - def generate_summary(self, rag_answer: str, query: str) -> str: - """ - Summarize the RAG answer (used when rag_summary=True). - """ - prompt = ( - "You have only the following context to answer the question, do not use any external knowledge.\n\n" - f"Question: {query}\n\n" - "Context:\n" - f"{rag_answer}\n\n" - "If the context contains the answer or useful information, respond with that information. \n" - "If no useful informations are, answer: no useful informations" - "Answer:" - "---------------------------" - - ) - - messages = [ - SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), - HumanMessage(content=prompt), - ] - response = self.llm.invoke(messages) - return self.clean_llm_summary(response.content) + rc = self.config.access_rag_config() + self.config.input_file = rc["mode_args"]["output_file"] + + # load input + with open(self.config.input_file, 'r', encoding='utf-8') as f: + data = json.load(f) + outputs = [] + for rec in data: + outputs.append(self.process_record(rec)) + # save + outp = Path(self.config.output_file) + outp.parent.mkdir(exist_ok=True, parents=True) + with open(outp, 'w', encoding='utf-8') as f: + json.dump(outputs, f, ensure_ascii=False, indent=2) + print(f"Results saved to {outp}") From ed72368270920488f821c3d7a7efc98de8be72ea Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Sun, 15 Jun 2025 05:16:26 -0700 Subject: [PATCH 07/33] local webrag working --- examples/websearchRAG/config.yaml | 7 +- .../websearchRAG/enhanced_results_trial.json | 88 +++++++ src/mmore/websearchRAG/config.py | 2 + src/mmore/websearchRAG/pipeline.py | 236 ++++++++++++------ 4 files changed, 260 insertions(+), 73 deletions(-) create mode 100644 examples/websearchRAG/enhanced_results_trial.json diff --git a/examples/websearchRAG/config.yaml b/examples/websearchRAG/config.yaml index e9c90c5d..2a9a60d7 100644 --- a/examples/websearchRAG/config.yaml +++ b/examples/websearchRAG/config.yaml @@ -1,12 +1,13 @@ websearch: - use_rag: true + use_rag: false rag_config_path: examples/rag/config.yaml use_summary: false + n_subqueries : 3 input_file: examples/rag/output.json input_queries: examples/rag/queries.jsonl output_file: examples/websearchRAG/enhanced_results_trial.json n_loops: 2 max_searches: 10 llm_config: - temperature: 0.2 - llm: OpenMeditron/meditron3-8b + llm_name: OpenMeditron/meditron3-8b + max_new_tokens: 250 \ No newline at end of file diff --git a/examples/websearchRAG/enhanced_results_trial.json b/examples/websearchRAG/enhanced_results_trial.json new file mode 100644 index 00000000..54f940d1 --- /dev/null +++ b/examples/websearchRAG/enhanced_results_trial.json @@ -0,0 +1,88 @@ +[ + { + "query": "When was Barack Obama born?", + "rag_summary": null, + "web_summary": null, + "short_answer": "barack obama was born on august 4, 1961, in honolulu, hawaii.", + "detailed_answer": "barack obama was born in honolulu, hawaii, to barack obama sr., a kenyan economist, and ann dunham, an anthropologist from kansas. his parents met while attending the university of hawaii. they divorced when barack was two years old, leaving ann to raise him primarily on her own. obama graduated from columbia university and harvard law school. he was elected to the illinois senate in 1996 and the u.s. senate in 2004. in 2008, he became the first african-american president of the united states, serving two terms from 2009 to 2017.", + "sources": [ + "https://www.providencejournal.com/story/news/politics/2025/06/14/president-donald-trumps-birthday-is-june-14-how-old-is-he/84147169007/", + "https://www.aplustopper.com/essay-on-barack-obama/", + "https://kvia.com/politics/cnn-us-politics/2025/02/05/barack-obama-fast-facts-2/", + "https://www.mapsofworld.com/usa/presidents/barack-obama.html", + "https://www.havefunwithhistory.com/barack-obama-timeline/", + "https://facts.net/history/historical-events/38-facts-about-barack-obama-elected-u-s-president/", + "https://www.cnbctv18.com/world/barack-obama-turns-62-all-about-former-us-president-and-his-famous-quotes-19453865.htm", + "https://www.britannica.com/biography/Barack-Obama", + "https://www.britannica.com/facts/Barack-Obama", + "https://www.politifact.com/factchecks/2025/jan/13/threads-posts/no-this-isnt-proof-former-president-barack-obama-w/", + "https://www.britannica.com/biography/Barack-Obama/Politics-and-ascent-to-the-presidency", + "https://thegrio.com/2025/03/14/barack-obama/", + "https://publicfinanceinternational.org/when-is-obamas-birthday/" + ] + }, + { + "query": "Who founded Google?", + "rag_summary": null, + "web_summary": null, + "short_answer": "larry page and sergey brin founded google in 1998.", + "detailed_answer": "google was founded in 1998 by two ph.d. students at stanford university, larry page and sergey brin. initially, the company focused on developing a search engine that could provide relevant and accurate search results. over the years, google expanded its services to include a wide range of products and services, such as google drive, google maps, youtube, and google assistant. today, google is one of the most influential tech companies globally, with over 140,000 employees and a market value of over $1 trillion.", + "sources": [ + "https://worldhistoryjournal.com/2025/02/21/how-google-started-and-became-a-global-tech-powerhouse/", + "https://newswirejet.com/google-mission-and-vision-statement/", + "https://www.indiatoday.in/technology/news/story/google-co-founder-sergey-brin-is-back-from-semi-retirement-is-now-daily-working-at-google-2728026-2025-05-21", + "https://umatechnology.org/a-brief-history-of-google-from-1998-to-the-present-day/", + "https://www.britannica.com/money/Google-Inc", + "https://www.businessinsider.com/sergey-brin-career-life-education?op=1", + "https://www.reference.com/business-finance/google-s-vision-statement-66f207e3583fca5f-14", + "https://panmore.com/google-vision-statement-mission-statement", + "https://biographyhost.com/p/larry-page-biography.html", + "https://canvasbusinessmodel.com/blogs/mission/google-mission", + "https://interestingengineering.com/culture/almost-everything-you-need-to-know-about-googles-history", + "https://en.wikipedia.org/wiki/Larry_Page", + "https://www.link-assistant.com/news/how-old-is-google.html" + ] + }, + { + "query": "Where is the Eiffel Tower located?", + "rag_summary": null, + "web_summary": null, + "short_answer": "the eiffel tower is located in paris, france.", + "detailed_answer": "the eiffel tower is a wrought-iron structure located in paris, france. it is situated on the banks of the seine river in the 7th arrondissement of paris. the tower was designed and built by gustave eiffel and his team between 1887 and 1889 for the 1889 exposition universelle, a world's fair celebrating the 100th anniversary of the french revolution. the tower stands at an official height of 984 feet (300 meters), including its antennas, and is made of 18,038 pieces of metal connected with 2.5 million rivets. it is considered a technological masterpiece in building-construction history and is one of the most recognizable structures in the world, drawing millions of visitors each year.", + "sources": [ + "https://artenquire.com/2024/08/28/who-built-the-eiffel-tower-and-why/", + "https://duitdesign.com/who-designed-the-eiffel-tower-and-when-was-it-built.html", + "https://www.solosophie.com/eiffel-tower-wasnt-designed-by-gustave-eiffel/", + "https://heightandsize.com/eiffel-tower-height/", + "https://travelpander.com/eiffel-tower-history-and-facts/", + "https://www.travelandleisure.com/attractions/landmarks-monuments/eiffel-tower-facts", + "https://www.britannica.com/topic/Eiffel-Tower-Paris-France", + "https://travelpander.com/does-the-eiffel-tower-have-a-purpose/", + "https://travelpander.com/eiffel-tower-why-is-it-important/", + "https://parade.com/travel/eiffel-tower-facts", + "https://www.architecturaldigest.com/story/eiffel-tower-everything-you-need-to-know" + ] + }, + { + "query": "When will the artificial intelligence conference be held?", + "rag_summary": null, + "web_summary": null, + "short_answer": "the ai conference will be held on september 17-18, 2025, in san francisco, california, us.", + "detailed_answer": "the ai conference is a two-day in-person conference discussing the latest in ai. it will be held on september 17-18, 2025, in san francisco, california, us. the conference will cover topics such as what's working in the best applied-ai startups, and technical lessons into the nuances of neural architectures, foundational models, alignment, and more. the cost of attending the conference ranges from $199.", + "sources": [ + "https://aaai.org/conference/aaai/aaai-25/main-technical-track/", + "https://engine.com/business-travel-guide/biggest-ai-expos-events-conferences-2025", + "https://www.iso.org/news/2025/01/world-first-international-ai-standards-summit-announced-in-davos", + "https://www.analyticsinsight.net/artificial-intelligence/top-ai-conferences-to-watch-out-for-in-2025", + "https://oxfordabstracts.com/blog/top-ai-conferences-to-attend/", + "https://www.forbes.com/sites/dianaspehar/2025/02/10/paris-ai-summit-2025-5-critical-themes-shaping-global-ai-policy/", + "https://vsynergize.com/blog/top-6-global-ai-summit-and-conferences-in-2025/", + "https://aaai.org/conference/aaai/aaai-25/", + "https://www.datacamp.com/blog/top-ai-conferences", + "https://www.analyticsvidhya.com/blog/2025/06/ai-conferences-2025/", + "https://datasciencedojo.com/blog/top-ai-conferences-in-usa/", + "https://www.ip-paris.fr/en/news/ai-action-summit-conference-ai-science-and-society-ip-paris", + "https://onu.delegfrance.org/ai-action-summit-10-and-11-february-2025" + ] + } +] \ No newline at end of file diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index aec81589..ce23d376 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -28,6 +28,7 @@ class WebsearchConfig: use_rag: bool rag_summary: bool input_file: str + input_queries: str output_file: str n_subqueries: int n_loops : int @@ -47,6 +48,7 @@ def from_dict(d: Dict[str, Any]) -> "WebsearchConfig": use_rag=d["use_rag"], rag_summary=d["rag_summary"], input_file=d["input_file"], + input_queries=d["input_queries"], output_file=d["output_file"], n_loops=d["n_loops"], n_subqueries=int(d["n_subqueries"]), diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index f833894b..cb3c286a 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -7,7 +7,10 @@ import logging import time -from duckduckgo_search import DDGS +from langchain_community.tools import DuckDuckGoSearchResults +from langchain_community.utilities import DuckDuckGoSearchAPIWrapper + + from langchain_core.messages import HumanMessage, SystemMessage from ..run_rag import rag @@ -15,7 +18,11 @@ from ..rag.llm import LLM, LLMConfig from .config import WebsearchConfig -#python -m mmore websearch --config-file examples/websearchRAG/config.yaml + + +# python -m mmore websearch --config-file examples/websearchRAG/config.yaml + + class WebsearchPipeline: """ @@ -29,15 +36,20 @@ def __init__(self, config: WebsearchConfig): self.rag_results: Optional[List[Dict[str, Any]]] = None def _initialize_llm(self) -> LLM: - if self.config.use_rag: + if self.config.use_rag is True: rag_cfg = self.config.access_rag_config() llm_conf = rag_cfg.get("rag", {}).get("llm") if llm_conf is None: raise ValueError("Missing 'llm' config under 'rag' in RAG configuration.") return LLM.from_config(LLMConfig(**llm_conf)) - else: + elif self.config.use_rag is False: base_conf = self.config.get_llm_config() - return LLM.from_config(base_conf) + if isinstance(base_conf, LLMConfig): # Ensure it's a dictionary + base_conf = base_conf.__dict__ + return LLM.from_config(LLMConfig(**base_conf)) + else: + raise ValueError("Invalid value for 'use_rag'. Must be True or False.") + def generate_summary(self, rag_answer: str, query: str) -> str: """ @@ -48,12 +60,14 @@ def generate_summary(self, rag_answer: str, query: str) -> str: f"Question: {query}\n\n" "Context:\n" f"{rag_answer}\n\n" - "If the context contains the answer or useful information, respond with that information. \n" + "If the context contains the answer or any useful information, respond with that information. \n" "If no useful informations are, answer: no useful informations" "Answer:" "---------------------------" - ) + + if not self.config.use_rag: + return None messages = [ SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), @@ -68,13 +82,11 @@ def _clean_section(self, content: str) -> str: delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" subquery_section = content.split(delimiter)[-1].strip() subquery_section = subquery_section.lower().strip() - print("##Current Summary##") + print("##Current Response##") print(subquery_section) print("##") return subquery_section - - @staticmethod def is_useful(text: str) -> bool: t = text.strip().lower() @@ -82,13 +94,10 @@ def is_useful(text: str) -> bool: return False return True - - - def clean_llm_output(self, content): # Define the delimiter after which the subqueries are located delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" - + # Split the content based on the delimiter if delimiter in content: @@ -96,37 +105,34 @@ def clean_llm_output(self, content): # Use regex to extract lines matching the subquery format subquery_section = subquery_section.lower().strip() subqueries = re.findall(r"subquery \d+: (.*)", subquery_section.strip()) - #print(f"subquery_section:", subquery_section) - #print('subquery outputs', subqueries) return subqueries else: return [] - def generate_subqueries( self, original_query: str, - rag_answer: Optional[str] = None + current_context: Optional[str] = None ) -> List[str]: """ - Generate concise search subqueries using the fine-tuned multilingual model. + Generate concise search subqueries """ n = self.config.n_subqueries instruction = f"You have the question and partial answer below:\nQuestion: {original_query}\n\n" - if rag_answer is None: + if current_context is None: task = ( - f"Generate {n}-independant subqueries based the original query, in order to generate the most complete research. Each subquery must be concise and ≤30 words.\n" - f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" - + f"Generate {n}-independant subqueries based on the original query, in order to generate the most complete research. Each subquery must be concise and ≤30 words.\n" + f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" ) else: task = ( - f"Partial RAG Answer: {rag_answer}\n\n" - f"Generate {n}-independant subqueries to refine the answer based on the original query. Each subquery must be concise and ≤30 words.\n" - f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" - f"---ANSWER ---" + f"Partial answer: {current_context}\n\n" + f"Generate {n}-independant subqueries to refine the answer based on the original query. Each subquery must be concise and ≤30 words.\n" + f"The subqueries should print in this format: subquery 1: new question, subquery 2: new question, etc. \n" + f"---ANSWER ---" ) + prompt = instruction + task messages = [ SystemMessage(content="You are an assistant specializing in generating search queries."), @@ -134,87 +140,171 @@ def generate_subqueries( ] response = self.llm.invoke(messages) - #print("######") - #print("LLM response: ", response.content) print("######") print("Clean response: ", self.clean_llm_output(response.content)) print("--------------------") return self.clean_llm_output(response.content) - - - - + # @staticmethod + # def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: + # time.sleep(2) + # try: + # with DDGS() as ddgs: + # print("query:", query) + # results = ddgs.text(query, max_results=max_results) + # return [{"title": r.get("title", ""), "url": r.get("href", "")} for r in results] + # except Exception as e: + # logger.error(f"DuckDuckGo error: {e}") + # return [] @staticmethod def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: - time.sleep(1) + """ + Perform a DuckDuckGo search using LangChain DuckDuckGo wrapper. + + Returns a list of dicts with keys: 'title' and 'url'. + """ + time.sleep(2) # polite delay try: - with DDGS() as ddgs: - print("query:", query) - results = ddgs.text(query, max_results=max_results) - return [{"title": r.get("title",""), "url": r.get("href","")} for r in results] + wrapper = DuckDuckGoSearchAPIWrapper(max_results=max_results) + search = DuckDuckGoSearchResults(api_wrapper=wrapper, output_format="list") + # Use run() method with output_format="list" to get list of dicts + results = search.invoke(query) + # Each item is expected to have keys like: 'title', 'link', 'snippet' + # Map 'link' to 'url' for compatibility with existing code + formatted_results = [] + for r in results: + snippet = r.get("snippet", "") + url = r.get("link", "") # note: it's "link" in LangChain results + if url: + formatted_results.append({"snippet": snippet, "url": url}) + return formatted_results except Exception as e: - logger.error(f"DuckDuckGo error: {e}") + logger.error(f"DuckDuckGo search error: {e}") return [] - def integrate_with_llm( - self, - original: str, - rag_doc: str, - web_snippets: List[str] - ) -> Dict[str, str]: + + + def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[str]) -> Dict[str, str]: # Build prompt for short & detailed answer sources = "\n".join(web_snippets) prompt = ( - f"Original Query: {original}\n" - f"RAG Document: {rag_doc}\n" - f"Web Snippets:\n{sources}\n" - "Provide output as: short answer: ..., detailed answer: ..." - ) + f"Original Query: {original}\n" + f"RAG Document Information:\n{rag_doc}\n\n" + f"Web Information:\n{sources}\n\n" + "Provide the response in the following format:\n" + "short answer: \n" + "detailed answer: " + ) + + msgs = [SystemMessage(content="You are a research assistant."), HumanMessage(content=prompt)] resp = self.llm.invoke(msgs) # parse - sa = re.search(r"short answer:\s*(.*?)\s*(?=detailed answer:)", resp.content, flags=re.IGNORECASE|re.DOTALL) - da = re.search(r"detailed answer:\s*(.*)", resp.content, flags=re.IGNORECASE|re.DOTALL) - return {"short": sa.group(1).strip() if sa else "", "detailed": da.group(1).strip() if da else ""} + clean_content = self._clean_section(resp.content) + + sa_matches = re.findall( + r"short answer:\s*(.*?)(?=detailed answer:)", + clean_content, + flags=re.IGNORECASE|re.DOTALL + ) + da_matches = re.findall( + r"detailed answer:\s*(.*)", + clean_content, + flags=re.IGNORECASE|re.DOTALL + ) + short = sa_matches[-1].strip().rstrip(",") if sa_matches else "" + detailed = da_matches[-1].strip() if da_matches else "" + return {"short": short, "detailed": detailed} + + + + + def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: - qr = rec.get("input","").strip() - # RAG step - rag_ans = rec.get("answer","") if self.config.use_rag else "" + qr = rec.get("input", "").strip() + rag_ans = rec.get("answer", "") if self.config.use_rag else "" rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None - # iterative loops + all_sources: Set[str] = set() current_context = rag_summary or "" final_short, final_detailed = "", "" + web_summary = "" + + web_summaries = [] + for loop in range(self.config.n_loops): - # generate subqueries - subs = self.generate_subqueries(qr, current_context) + if self.config.use_rag: + subs = self.generate_subqueries(qr, current_context) + else: + subs = self.generate_subqueries(qr) # Based on original query only + snippets, urls = [], [] + subquery_summaries = [] + for sq in subs: - print("subquery: ", sq) + #print("subquery:", sq) res = self.duckduckgo_search(sq, max_results=self.config.max_searches) + + subquery_snippets = [] for r in res: if r["url"] not in all_sources: all_sources.add(r["url"]) - # For simplicity assume snippet = title - snippets.append(f"{r['title']} ({r['url']})") - # integrate with LLM - out = self.integrate_with_llm(qr, current_context, snippets) + snippet = f"{r['snippet']})" + snippets.append(snippet) + #print("Current sub-snippet", snippet) + subquery_snippets.append(snippet) + + # Summarize each subquery's snippets independently if rag_summary is True + if self.config.rag_summary: + if subquery_snippets: + combined_snippets = "\n".join(subquery_snippets) + summary = self.generate_summary(combined_snippets, sq) + subquery_summaries.append(summary) + else: + subquery_summaries.append("") + + if self.config.rag_summary: + combined_sub_summaries = "\n".join([str(s) if s else "" for s in subquery_summaries]) + web_summary = self.generate_summary(combined_sub_summaries, qr) + web_summaries.append(web_summary) + #print("Current websummary: ", web_summary) + + # Combine rag summary, web summary, and original query for final integration + context_for_llm = f"RAG informations:\n{rag_summary or ''}\n\nWeb informations:\n{web_summary}" + else: + # If not summarizing subqueries, use rag summary or current context with snippets + context_for_llm = current_context + + combined_web_summaries = "\n".join([str(s) if s else "" for s in web_summaries]) + web_summary_all = self.generate_summary(combined_web_summaries, qr) + + # Integrate all info with LLM + out = self.integrate_with_llm(qr, context_for_llm, snippets) final_short, final_detailed = out["short"], out["detailed"] - # next context = detailed answer + + # Prepare context for next loop iteration current_context = final_detailed + + + return { "query": qr, - "rag_summary": rag_summary, + "rag_summary": rag_summary if self.config.use_rag else None, + "web_summary": web_summary_all if self.config.rag_summary else None, "short_answer": final_short, "detailed_answer": final_detailed, - "sources": list(all_sources) + "sources": list(all_sources), } + + + + + def run(self): # RAG pipeline if self.config.use_rag: @@ -224,10 +314,16 @@ def run(self): rag(self.config.rag_config_path) rc = self.config.access_rag_config() self.config.input_file = rc["mode_args"]["output_file"] + with open(self.config.input_file, 'r', encoding='utf-8') as f: + data = json.load(f) + else: + self.config.input_file = self.config.input_queries + data = [] + with open(self.config.input_file, 'r', encoding='utf-8') as f: + for line in f: + data.append(json.loads(line.strip())) # JSONL format + - # load input - with open(self.config.input_file, 'r', encoding='utf-8') as f: - data = json.load(f) outputs = [] for rec in data: outputs.append(self.process_record(rec)) From ffdb4c969973b7a16faeac8e2daf537fc88a9e73 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Tue, 17 Jun 2025 05:43:44 -0700 Subject: [PATCH 08/33] clean folder --- examples/websearch/config.yaml | 8 - .../websearch/enhanced_results_trial.json | 234 ------------------ examples/websearchRAG/config.yaml | 3 +- src/mmore/run_websearch.py | 113 +++++---- src/mmore/websearch/__init__.py | 6 - src/mmore/websearch/config.py | 42 ---- src/mmore/websearch/llm.py | 57 ----- src/mmore/websearch/pipeline.py | 234 ------------------ src/mmore/websearchRAG/config.py | 43 ++-- src/mmore/websearchRAG/llm.py | 41 --- src/mmore/websearchRAG/pipeline.py | 102 +++----- 11 files changed, 123 insertions(+), 760 deletions(-) delete mode 100644 examples/websearch/config.yaml delete mode 100644 examples/websearch/enhanced_results_trial.json delete mode 100644 src/mmore/websearch/__init__.py delete mode 100644 src/mmore/websearch/config.py delete mode 100644 src/mmore/websearch/llm.py delete mode 100644 src/mmore/websearch/pipeline.py delete mode 100644 src/mmore/websearchRAG/llm.py diff --git a/examples/websearch/config.yaml b/examples/websearch/config.yaml deleted file mode 100644 index 76fa4e71..00000000 --- a/examples/websearch/config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -input_file: "examples/rag/output.json" -output_file: "examples/websearch/enhanced_results_trial.json" -n_loops: 2 -llm_name: "OpenMeditron/meditron3-8b" -max_searches: 10 -llm_config: - max_new_tokens: 2000 - temperature: 0.2 \ No newline at end of file diff --git a/examples/websearch/enhanced_results_trial.json b/examples/websearch/enhanced_results_trial.json deleted file mode 100644 index 63fe5bcf..00000000 --- a/examples/websearch/enhanced_results_trial.json +++ /dev/null @@ -1,234 +0,0 @@ -[ - { - "query": "When was Barack Obama born?", - "RAG_summary": "1961", - "WEB_RAG_summary": "Barack Obama was born in 1961. He was born in Honolulu, Hawaii, to Ann Dunham and Barack Obama Sr. His father was a Kenyan economist, and his mother was an American anthropologist. Obama's parents divorced when he was two years old, and he was raised by his mother and maternal grandparents in Hawaii. He attended Punahou School, a private college preparatory school in Honolulu, and graduated in 1979. Obama then attended Occidental College in Los Angeles for two years before transferring to Columbia University in New York City, where he earned a Bachelor of Arts degree in political science in 1983. After college, Obama worked as a community organizer in Chicago for three years. He then attended Harvard Law School, where he earned a Juris Doctor (J.D.) degree in 1991. Obama's political career began in 1996, when he was elected to the Illinois State Senate. He served three terms in the state senate before being elected to the U.S. Senate in 2004. Obama was the first African American to be elected to the U.S. Senate from Illinois. In 2008, Obama was elected as the 44th President of the United States, becoming the first African American to hold the office. He was re-elected in 2012. Obama's presidency was marked by several significant accomplishments, including the passage of the Affordable Care Act (ACA), also known as Obamacare, and the Dodd-Frank Wall Street Reform and Consumer Protection Act. He also oversaw the end of the Iraq War and the beginning of the end of the War in Afghanistan. Obama was awarded the Nobel Peace Prize in 2009 for his efforts to strengthen international diplomacy and cooperation. After leaving office in 2017, Obama has remained active in politics, advocating for progressive causes and supporting Democratic candidates in the 2018 midterm elections. He has also written several books, including Dreams from My Father and The Audacity of Hope.", - "WEBSEARCH_details": "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii. His father, Barack Obama Sr., was a Kenyan economist, and his mother, Ann Dunham, was an American anthropologist. Obama's parents divorced when he was two years old, and he was raised by his mother and maternal grandparents in Hawaii. He attended Punahou School, a private college preparatory school in Honolulu, and graduated in 1979. Obama then attended Occidental College in Los Angeles for two years before transferring to Columbia University in New York City, where he earned a Bachelor of Arts degree in political science in 1983. After college, Obama worked as a community organizer in Chicago for three years. He then attended Harvard Law School, where he earned a Juris Doctor (J.D.) degree in 1991. Obama's political career began in 1996, when he was elected to the Illinois State Senate. He served three terms in the state senate before being elected to the U.S. Senate in 2004. Obama was the first African American to be elected to the U.S. Senate from Illinois. In 2008, Obama was elected as the 44th President of the United States, becoming the first African American to hold the office. He was re-elected in 2012. Obama's presidency was marked by several significant accomplishments, including the passage of the Affordable Care Act (ACA), also known as Obamacare, and the Dodd-Frank Wall Street Reform and Consumer Protection Act. He also oversaw the end of the Iraq War and the beginning of the end of the War in Afghanistan. Obama was awarded the Nobel Peace Prize in 2009 for his efforts to strengthen international diplomacy and cooperation. After leaving office in 2017, Obama has remained active in politics, advocating for progressive causes and supporting Democratic candidates in the 2018 midterm elections. He has also written several books, including Dreams from My Father and The Audacity of Hope.", - "sources": [ - { - "title": "Early life and career of Barack Obama - Wikipedia", - "url": "https://en.wikipedia.org/wiki/Early_life_and_career_of_Barack_Obama" - }, - { - "title": "Barack Obama | Biography, Parents, Education, Presidency, Books ...", - "url": "https://www.britannica.com/biography/Barack-Obama" - }, - { - "title": "Kenyan Government Releases Obama's Real Birth Certificate", - "url": "https://www.snopes.com/fact-check/birthing-pains/" - }, - { - "title": "Barack Obama: Biography, 44th U.S. President, Politician", - "url": "https://www.biography.com/political-figures/barack-obama" - }, - { - "title": "Barack Obama - Age, Education & Mother - HISTORY", - "url": "https://www.history.com/articles/barack-obama" - }, - { - "title": "President Barack Obama | Barack Obama Presidential Library", - "url": "https://www.obamalibrary.gov/obamas/president-barack-obama" - }, - { - "title": "Barack Obama | The White House", - "url": "https://bidenwhitehouse.archives.gov/about-the-white-house/presidents/barack-obama/" - }, - { - "title": "Barack Obama - National Geographic Kids", - "url": "https://kids.nationalgeographic.com/history/article/barack-obama" - }, - { - "title": "President Obama | The Obama Foundation", - "url": "https://www.obama.org/democracy-forum-2023/president-obama/" - }, - { - "title": "Barack Obama Timeline - Have Fun With History", - "url": "https://www.havefunwithhistory.com/barack-obama-timeline/" - }, - { - "title": "Barack Obama - Wikipedia", - "url": "https://en.wikipedia.org/wiki/Barack_Obama" - }, - { - "title": "10 Facts About Barack Obama - Have Fun With History", - "url": "https://www.havefunwithhistory.com/facts-about-barack-obama/" - } - ] - }, - { - "query": "Who founded Google?", - "RAG_summary": "Google was founded by Larry Page and Sergey Brin in 1998.", - "WEB_RAG_summary": "Google was founded by Larry Page and Sergey Brin in 1998. The founders were students at Stanford University in California. They created Google as a research project for their Ph.D. Thesis. The name Google was derived from the mathematical term \"googol,\" which means a huge number. Google was initially created to organize the world's information and make it universally accessible and useful. Google's first product was a search engine that allowed users to search the web for information. Google's search engine was unique because it used a new algorithm that ranked web pages based on their relevance to the search query. Google's search engine quickly became popular and the company began to expand its product offerings. Today, Google is a multinational technology company that specializes in Internet-related services and products. Google's main products include the Google search engine, Google Maps, Google Drive, Google Docs, Google Sheets, Google Slides, Google Calendar, Google Hangouts, Google Photos, Google Play, and Google Chrome. Google is headquartered in Mountain View, California, and is led by CEO Sundar Pichai. Google is a subsidiary of Alphabet Inc., a multinational conglomerate created in 2015 to manage Google and its other subsidiaries. Google is one of the most valuable companies in the world, with a market capitalization of over $1 trillion. Google's mission is to organize the world's information and make it universally accessible and useful. Google's vision is to be the most trusted and respected company in the world. Google's values include innovation, simplicity, and user-centricity. Google's products and services are designed to make people's lives easier and more productive. Google's products and services are used by billions of people around the world every day. Google's products and services are available in over 100 languages and are used in over 190 countries. Google's products and services are available on desktop computers, laptops, tablets, smartphones, and smart home devices. Google's products and services are free to use and are supported by advertising. Google's advertising products include Google Ads, Google AdSense, and Google Analytics. Google's advertising products allow businesses to reach their target audience and measure the effectiveness of their advertising campaigns. Google's advertising products are used by businesses of all sizes, from small businesses to multinational corporations. Google's advertising products are available in over 100 countries and are used by businesses in a wide range of industries. Google's advertising products are designed to be easy to use and provide businesses with the tools they need to succeed online. Google's advertising products are supported by a team of experts who provide businesses with the guidance and support they need to succeed. Google's advertising products are constantly evolving to meet the changing needs of businesses. Google's advertising products are designed to be transparent and provide businesses with the information they need to make informed decisions. Google's advertising products are designed to be fair and provide businesses with the opportunity to succeed. Google's advertising products are designed to be accountable and provide businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness of their advertising campaigns. Google's advertising products are designed to be sustainable and provide businesses with the opportunity to succeed while also contributing to the betterment of society. Google's advertising products are designed to be responsible and provide businesses with the opportunity to succeed while also respecting the privacy and security of their users. Google's advertising products are designed to be innovative and provide businesses with the opportunity to succeed while also pushing the boundaries of what is possible. Google's advertising products are designed to be user-centric and provide businesses with the opportunity to succeed while also providing their users with a great experience. Google's advertising products are designed to be accessible and provide businesses with the opportunity to succeed while also making it easy for businesses to reach their target audience. Google's advertising products are designed to be inclusive and provide businesses with the opportunity to succeed while also providing opportunities for businesses of all sizes to succeed online. Google's advertising products are designed to be collaborative and provide businesses with the opportunity to succeed while also working with businesses to help them succeed. Google's advertising products are designed to be accountable and provide businesses with the opportunity to succeed while also providing businesses with the tools they need to measure the effectiveness", - "WEBSEARCH_details": "", - "sources": [ - { - "title": "History of Google - Wikipedia", - "url": "https://en.wikipedia.org/wiki/History_of_Google" - }, - { - "title": "How we started and where we are today - Google - Google - About Google", - "url": "https://about.google/company-info/our-story/" - }, - { - "title": "Google | History & Facts; Products & Services | Britannica Money", - "url": "https://www.britannica.com/money/Google-Inc" - }, - { - "title": "Larry Page | Biography, Google, & Facts | Britannica Money", - "url": "https://www.britannica.com/money/Larry-Page" - }, - { - "title": "Who Owns Google - Google Founders, Current CEO, & History", - "url": "https://www.feedough.com/who-owns-google-google-founders-current-ceo-history/" - }, - { - "title": "The History of Google and How It Was Invented - ThoughtCo", - "url": "https://www.thoughtco.com/who-invented-google-1991852" - }, - { - "title": "Who Invented Google? The Story Behind The Search Engine", - "url": "https://historycooperative.org/who-invented-google/" - }, - { - "title": "Sergey Brin - Education, Google & Wife - Biography", - "url": "https://www.biography.com/business-leaders/sergey-brin" - }, - { - "title": "Who Actually Invented Google and When? The Full, Fascinating History", - "url": "https://www.historytools.org/companies/who-actually-invented-google-and-when" - }, - { - "title": "Who Founded Google? - WorldAtlas", - "url": "https://www.worldatlas.com/articles/who-founded-google.html" - }, - { - "title": "The Complete History and Evolution of Google", - "url": "https://www.historytools.org/companies/google-guide" - } - ] - }, - { - "query": "Where is the Eiffel Tower located?", - "RAG_summary": "The Eiffel Tower is located in Paris, France.", - "WEB_RAG_summary": "The Eiffel Tower is located in Paris, France. It is one of the most famous landmarks in the world and is known for its unique design and stunning views of the city. The tower was built in 1889 for the World's Fair and was intended to be a temporary structure. However, it quickly became a symbol of Paris and has since been a popular tourist attraction. The tower is 324 meters tall and is made of wrought iron. It is also home to a restaurant and observation deck. 2.", - "WEBSEARCH_details": "The Eiffel Tower is located on the Champ de Mars in Paris, France. It is situated on the Seine River and is easily accessible by public transportation. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for tourists and locals alike. The tower is open to visitors year-round and offers stunning views of the city. Visitors can take an elevator to the top of the tower or climb the stairs. The tower is also home to a restaurant and observation deck. The Eiffel Tower is a popular tourist attraction and is visited by millions of people each year. It is considered one of the most iconic landmarks in the world and is a symbol of Paris. The tower is also a popular spot for proposals and weddings. The Eiffel Tower has been featured in many movies and TV shows and is a popular subject for photography. The tower is also home to a museum that tells the story of its construction and history. The Eiffel Tower is a must-see attraction for anyone visiting Paris. It is a symbol of the city and is a popular spot for", - "sources": [ - { - "title": "Eiffel Tower - Wikipedia", - "url": "https://en.wikipedia.org/wiki/Eiffel_Tower" - }, - { - "title": "Eiffel Tower | History, Height, & Facts | Britannica", - "url": "https://www.britannica.com/topic/Eiffel-Tower-Paris-France" - }, - { - "title": "Where is the Eiffel Tower? - World In Paris", - "url": "https://worldinparis.com/where-is-the-eiffel-tower" - }, - { - "title": "Where is Eiffel Tower Located? France", - "url": "https://www.whereig.com/wonders-of-the-world/where-is-eiffel-tower.html" - }, - { - "title": "The Eiffel Tower: all there is to know - Official website", - "url": "https://www.toureiffel.paris/en/the-monument" - }, - { - "title": "Where is The Eiffel Tower on the map? Exact location of The Eiffel ...", - "url": "https://www.city2map.com/en/where-is/eiffeltower.html" - }, - { - "title": "Discovering the Heart of Paris: Where is the Eiffel Tower Located?", - "url": "https://francefocusguide.com/blog/eiffel-tower-location-paris/" - }, - { - "title": "Location of the Eiffel tower - Wonders of the world", - "url": "https://www.wonders-of-the-world.net/Eiffel-Tower/Location-of-the-Eiffel-tower.php" - }, - { - "title": "Eiffel Tower Location In Paris: Access, Views ... - TravelPander", - "url": "https://travelpander.com/eiffel-tower-location-in-paris/" - }, - { - "title": "Eiffel Tower Location | Fastest, Cheapest & Scenic Options", - "url": "https://www.paristickets.com/eiffel-tower/location-getting-there-directions/" - }, - { - "title": "Eiffel Tower Location: A Parisian Icon in the Heart of France", - "url": "https://geographypin.com/eiffel-tower-location/" - }, - { - "title": "The Eiffel Tower - History and Facts | History Hit", - "url": "https://www.historyhit.com/locations/the-eiffel-tower/" - }, - { - "title": "Eiffel Tower: An Architecture Landmark to Visit in Paris", - "url": "https://www.architecturelab.net/architecture/landmark/eiffel-tower/" - }, - { - "title": "Information and facts about the Eiffel Tower", - "url": "https://www.eiffeltower-paris.info/information-and-facts/" - } - ] - }, - { - "query": "When will the artificial intelligence conference be held?", - "RAG_summary": "2024", - "WEB_RAG_summary": "The artificial intelligence conference will be held in 2025. The conference will be held in Chicago, IL, at Northwestern University. The conference is organized by the Association for the Advancement of Artificial Intelligence (AAAI). The conference will be held in 2025, and the exact date is not yet known. The conference will cover various topics related to artificial intelligence, including machine learning, natural language processing, computer vision, and robotics. The conference will also feature keynote speakers, workshops, and tutorials. The conference is expected to attract researchers, practitioners, and students from around the world.", - "WEBSEARCH_details": "The 39th Annual AAAI Conference on Artificial Intelligence will be held in 2025. The conference will be held in Chicago, IL, at Northwestern University. The conference is organized by the Association for the Advancement of Artificial Intelligence (AAAI). The conference will cover various topics related to artificial intelligence, including machine learning, natural language processing, computer vision, and robotics. The conference will also feature keynote speakers, workshops, and tutorials. The conference is expected to attract researchers, practitioners, and students from around the world.\nThe Top AI Conferences To Attend In 2025 is a list of the top AI conferences to attend in 2025. The list includes the International Conference on Artificial Intelligence (ICAI), the International Joint Conference on Artificial Intelligence (IJCAI), the Conference on Neural Information Processing Systems (NeurIPS), and the Conference on Computer Vision and Pattern Recognition (CVPR). The list also includes the International Conference on Machine Learning (ICML), the Conference on Human Factors in Computing Systems (CHI), and the Conference on Artificial Intelligence (AAAI).\nThe Best Artificial Intelligence Conferences & Events of 2025 is a list of the best AI conferences and events to attend in 2025. The list includes the International Conference on Artificial Intelligence (ICAI), the International Joint Conference on Artificial Intelligence (IJCAI), the Conference on Neural Information Processing Systems (NeurIPS), and the Conference on Computer Vision and Pattern Recognition (CVPR). The list also includes the International Conference on Machine Learning (ICML), the Conference on Human Factors in Computing Systems (CHI), and the Conference on Artificial Intelligence (AAAI).\nThe Future of AI: Top Global Conferences to Watch from 2025 to 2026 is a list of the top global AI conferences to watch from 2025 to 2026. The list includes the International Conference on Artificial Intelligence (ICAI), the International Joint Conference on Artificial Intelligence (IJCAI), the Conference on Neural Information Processing Systems (NeurIPS), and the Conference on Computer Vision and Pattern Recognition (CVPR). The list also includes the International Conference on Machine Learning (ICML), the Conference on Human Factors in Computing Systems (CHI), and the Conference on Artificial Intelligence (AAAI).\nIAIC2024 is an international conference on artificial intelligence that will be held in 2024. The conference will cover various topics related to artificial intelligence, including machine learning, natural language processing, computer vision, and robotics. The conference will also feature keynote speakers, workshops, and tutorials. The conference is expected to attract researchers, practitioners, and students from around the world.\nThe 2025 MIT AI Conference is an annual conference on artificial intelligence that will be held in 2025. The conference will cover various topics related to artificial intelligence, including machine learning, natural language processing, computer vision, and robotics. The conference will also feature keynote speakers, workshops, and tutorials. The conference is expected to attract researchers, practitioners, and students from around the world.", - "sources": [ - { - "title": "ICAIL 2025 | Chicago, IL - Northwestern University", - "url": "https://sites.northwestern.edu/icail2025/" - }, - { - "title": "The 39th Annual AAAI Conference on Artificial Intelligence", - "url": "https://aaai.org/conference/aaai/aaai-25/" - }, - { - "title": "The Top AI Conferences To Attend In 2025 - oxfordabstracts.com", - "url": "https://oxfordabstracts.com/blog/top-ai-conferences-to-attend/" - }, - { - "title": "The Best Artificial Intelligence Conferences & Events of 2025", - "url": "https://www.splunk.com/en_us/blog/learn/ai-artificial-intelligence-conferences-events.html" - }, - { - "title": "The Future of AI: Top Global Conferences to Watch from 2025 to 2026", - "url": "https://dscnextcon.com/the-future-of-ai-top-global-conferences-to-watch-from-2025-to-2026/" - }, - { - "title": "2025 MIT AI Conference | ILP", - "url": "https://ilp.mit.edu/AI25" - }, - { - "title": "IAIC2024", - "url": "http://2024.iaicconf.com/" - }, - { - "title": "Association for the Advancement of Artificial Intelligence AAAI Annual ...", - "url": "https://www.paconvention.com/events/detail/association-for-the-advancement-of-artificial-intelligence-aaai-annual-conference-25016" - }, - { - "title": "Artificial Intelligence Conference | Machine Learning Meetings ...", - "url": "https://cognitionconferences.com/artificialintelligence/" - }, - { - "title": "Major AI Events 2025 - Airmeet", - "url": "https://www.airmeet.com/hub/blog/major-ai-events/" - }, - { - "title": "The Biggest AI Conferences & Events Happening in 2025", - "url": "https://engine.com/business-travel-guide/biggest-ai-expos-events-conferences-2025" - }, - { - "title": "AI Conferences 2025 : Agenda + List of Events and Exhibitions", - "url": "https://www.aixploria.com/en/worldwide-top-ai-conferences-agenda/" - }, - { - "title": "Top Artificial Intelligence Conferences 2025", - "url": "https://academicworldresearch.org/blog/top-artificial-intelligence-conferences/" - } - ] - } -] \ No newline at end of file diff --git a/examples/websearchRAG/config.yaml b/examples/websearchRAG/config.yaml index 2a9a60d7..deb764a3 100644 --- a/examples/websearchRAG/config.yaml +++ b/examples/websearchRAG/config.yaml @@ -10,4 +10,5 @@ websearch: max_searches: 10 llm_config: llm_name: OpenMeditron/meditron3-8b - max_new_tokens: 250 \ No newline at end of file + max_new_tokens: 250 + mode: local \ No newline at end of file diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index e8408540..d8b2be24 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -4,27 +4,24 @@ import logging import time import torch -from dataclasses import dataclass -from typing import Any, Dict +from dataclasses import dataclass, asdict, fields +from typing import Any, Dict, Optional, Union, cast +import uvicorn +from fastapi import FastAPI +from langserve import add_routes -from .websearchRAG.logging_config import logger # Import the shared logger - +from .websearchRAG.logging_config import logger from .utils import load_config from .websearchRAG.config import WebsearchConfig from .websearchRAG.pipeline import WebsearchPipeline +from .run_rag import LocalConfig, APIConfig, create_api +from .websearchRAG.logging_config import logger # Import the shared logger -# WEBSRCH_EMOJI = "🌐" -# logger = logging.getLogger(__name__) -# logging.basicConfig( -# format=f"[WebSearch {WEBSRCH_EMOJI} -- %(asctime)s] %(message)s", -# level=logging.INFO, -# datefmt="%Y-%m-%d %H:%M:%S", -# ) -# CUDA tweaks (as before) +# CUDA tweaks for best perf torch.backends.cuda.enable_mem_efficient_sdp(False) torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_math_sdp(True) @@ -35,52 +32,74 @@ class WebsearchSection: use_rag: bool rag_config_path: str use_summary: bool - n_subqueries : int + n_subqueries: int input_file: str input_queries: str output_file: str n_loops: int max_searches: int llm_config: Dict[str, Any] + mode: str @dataclass -class WebsearchAppConfig: +class WebsearchInferenceConfig: websearch: WebsearchSection + mode: str = "local" # "local" or "api" + mode_args: Optional[Union[LocalConfig, APIConfig]] = None + + def __post_init__(self): + if self.mode == "api" and self.mode_args is None: + self.mode_args = APIConfig() + + +def build_pipeline(ws: WebsearchSection) -> WebsearchPipeline: + ws_dict = asdict(ws) + config_fields = {f.name for f in fields(WebsearchConfig)} + filtered_dict = {k: v for k, v in ws_dict.items() if k in config_fields} + web_cfg = WebsearchConfig(**filtered_dict) + return WebsearchPipeline(config=web_cfg) + +def run_websearch(config_file): + + # 1) Load config + cfg = load_config(config_file, WebsearchInferenceConfig) + ws = cfg.websearch + #logger.info("Configuration file", ws) + if not cfg.mode: + raise ValueError("Configuration is missing the 'mode' field. Ensure it is set to 'local' or 'api'.") + + + # 2) Build pipeline once + # web_cfg = WebsearchConfig(**asdict(ws)) + # pipeline = WebsearchPipeline(config=web_cfg) + pipeline = build_pipeline(ws) + + # 3) Dispatch on mode + if cfg.mode == "local": + logger.info("Running Websearch pipeline in LOCAL mode...") + start = time.time() + pipeline.run() + logger.info(f"Completed in {time.time() - start:.2f}s") + + elif cfg.mode == "api": + logger.info("Starting Websearch pipeline in API mode...") + # wrap pipeline.__call__ via create_api + app: FastAPI = create_api(pipeline, cfg.mode_args.endpoint) + + @app.get("/health") + def _health(): + return {"status": "healthy"} + uvicorn.run( + app, + host=cfg.mode_args.host, + port=cfg.mode_args.port, + log_level="info", + ) -def run_websearch(config_file: str): - """ - Run the Websearch (+ optional RAG) pipeline according to the YAML at config_file. - """ - logger.info(f"Websearch configuration file: {config_file}") - - # 1) Load and parse the YAML using the wrapper - app_cfg = load_config(config_file, WebsearchAppConfig) - ws = app_cfg.websearch - logger.info(f"Parsed Websearch section: {ws}") - - # 2) Map to the pipeline's config dict - web_cfg_dict = { - "use_rag": ws.use_rag, - "rag_config_path": ws.rag_config_path, - "rag_summary": ws.use_summary, - "input_file": ws.input_file, - "input_queries": ws.input_queries, - "output_file": ws.output_file, - "n_subqueries": ws.n_subqueries, - "n_loops": ws.n_loops, - "max_searches": ws.max_searches, - "llm_config": ws.llm_config, - } - web_cfg = WebsearchConfig.from_dict(web_cfg_dict) - logger.info(f"Using WebsearchConfig: {web_cfg}") - - # 3) Instantiate and run - pipeline = WebsearchPipeline(config=web_cfg) - start = time.time() - pipeline.run() - logger.info(f"Websearch pipeline completed in {time.time() - start:.2f} seconds.") + else: + raise ValueError(f"Unknown mode: {cfg.mode!r}. Must be 'local' or 'api'.") if __name__ == "__main__": @@ -92,4 +111,4 @@ def run_websearch(config_file: str): help="Path to the Websearch configuration file (YAML)." ) args = parser.parse_args() - run_websearch(args.config_file) + run_websearch(args.config_file) \ No newline at end of file diff --git a/src/mmore/websearch/__init__.py b/src/mmore/websearch/__init__.py deleted file mode 100644 index 79863ad7..00000000 --- a/src/mmore/websearch/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Websearch module for enhancing RAG outputs with web search.""" - -from .pipeline import WebsearchPipeline -from .config import WebsearchConfig - -__all__ = ["WebsearchPipeline", "WebsearchConfig"] \ No newline at end of file diff --git a/src/mmore/websearch/config.py b/src/mmore/websearch/config.py deleted file mode 100644 index d56d22bd..00000000 --- a/src/mmore/websearch/config.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Configuration for websearch pipeline.""" -from dataclasses import dataclass -from typing import Dict, Any - -from .llm import LLMConfig - -@dataclass -class WebsearchConfig: - """Configuration for websearch pipeline.""" - input_file: str - output_file: str - n_loops: int = 2 - llm_name: str = "OpenMeditron/meditron3-8b" - max_searches: int = 10 - llm_config: Dict[str, Any] = None - - def __post_init__(self): - if self.llm_config is None: - self.llm_config = { - "max_new_tokens": 1000, - "temperature": 0.7 - } - - @classmethod - def from_dict(cls, config: Dict[str, Any]) -> "WebsearchConfig": - """Create config from dictionary.""" - return cls( - input_file=config["input_file"], - output_file=config["output_file"], - n_loops=config.get("n_loops", 2), - llm_name=config.get("llm_name", "OpenMeditron/meditron3-8b"), - max_searches=config.get("max_searches", 10), - llm_config=config.get("llm_config", None) - ) - - def get_llm_config(self) -> LLMConfig: - """Get LLM configuration.""" - return LLMConfig( - llm_name=self.llm_name, - max_new_tokens=self.llm_config["max_new_tokens"], - temperature=self.llm_config["temperature"] - ) \ No newline at end of file diff --git a/src/mmore/websearch/llm.py b/src/mmore/websearch/llm.py deleted file mode 100644 index de8acfe1..00000000 --- a/src/mmore/websearch/llm.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Simple LLM implementation for websearch.""" -from dataclasses import dataclass - -from transformers import AutoModelForCausalLM, AutoTokenizer -import torch -from langchain_core.messages import BaseMessage - -@dataclass -class LLMConfig: - """Simple LLM configuration.""" - llm_name: str - max_new_tokens: int = 2000 - temperature: float = 0.2 - -class LLM: - """Simple LLM wrapper for websearch.""" - def __init__(self, model, tokenizer, config: LLMConfig): - self.model = model - self.tokenizer = tokenizer - self.config = config - - @classmethod - def from_config(cls, config: LLMConfig): - """Create LLM from configuration.""" - tokenizer = AutoTokenizer.from_pretrained(config.llm_name) - model = AutoModelForCausalLM.from_pretrained( - config.llm_name, - torch_dtype=torch.float16, - device_map="auto" - ) - return cls(model, tokenizer, config) - - def invoke(self, messages: list[BaseMessage]) -> BaseMessage: - """Generate response for the given messages.""" - # Convert messages to prompt - prompt = "" - for msg in messages: - if msg.type == "system": - prompt += f"System: {msg.content}\n" - elif msg.type == "human": - prompt += f"Human: {msg.content}\n" - elif msg.type == "assistant": - prompt += f"Assistant: {msg.content}\n" - prompt += "Assistant: " - - # Generate - inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device) - outputs = self.model.generate( - **inputs, - max_new_tokens=self.config.max_new_tokens, - temperature=self.config.temperature, - do_sample=True, - pad_token_id=self.tokenizer.eos_token_id - ) - response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) - - return BaseMessage(content=response, type="assistant") \ No newline at end of file diff --git a/src/mmore/websearch/pipeline.py b/src/mmore/websearch/pipeline.py deleted file mode 100644 index 8b073e1c..00000000 --- a/src/mmore/websearch/pipeline.py +++ /dev/null @@ -1,234 +0,0 @@ -import json -import re -from pathlib import Path -from typing import Dict, List, Any, Optional, Set - -from duckduckgo_search import DDGS -from langchain_core.messages import HumanMessage, SystemMessage - -from .llm import LLM -from .config import WebsearchConfig - - -class WebsearchPipeline: - """Pipeline for enhancing RAG outputs with web search.""" - - def __init__(self, config: WebsearchConfig): - """Initialize pipeline.""" - self.config = config - self.llm = LLM.from_config(config.get_llm_config()) - - @staticmethod - def clean_llm_output(text: str) -> str: - """Remove internal model tokens.""" - return re.sub(r'<\|.*?\|>', '', text).strip() - - @staticmethod - def extract_llm_answer(raw_response: str) -> str: - """Extract the answer following the 'Answer:' prefix from the LLM response.""" - raw_response = WebsearchPipeline.clean_llm_output(raw_response) - match = re.search(r'Answer:(.*)', raw_response, re.DOTALL | re.IGNORECASE) - if match: - return match.group(1).strip() - return raw_response - - def generate_summary(self, rag_answer: str, query: str) -> str: - """Build the prompt, invoke the LLM, and extract the answer.""" - prompt = ( - "You have only the following context to answer the question—do not use any external knowledge.\n\n" - f"Question: {query}\n\n" - "Context:\n" - f"{rag_answer}\n\n" - "If the context contains the answer or useful information, respond with that information. " - "Answer:" - ) - - messages = [ - SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), - HumanMessage(content=prompt), - ] - response = self.llm.invoke(messages) - return self.extract_llm_answer(response.content) - - @staticmethod - def extract_query_from_llm_output(text: str) -> str: - """Extract everything between tags, picking longest match.""" - matches = re.findall(r'<\s*question\s*>(.*?)<\s*/\s*question\s*>', text, re.DOTALL | re.IGNORECASE) - return max(matches, key=len).strip() if matches else "" - - @staticmethod - def validate_search_query(query: str, original_query: str) -> str: - """Fallback to default if empty or >30 words.""" - return query if query and len(query.split()) <= 30 else original_query - - @staticmethod - def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: - """Perform DuckDuckGo search.""" - try: - with DDGS() as ddgs: - return [ - {'title': r.get('title'), 'url': r.get('href')} - for r in ddgs.text(query, max_results=max_results) - ] - except Exception as e: - print(f"Search error: {e}") - return [] - - def generate_search_query(self, original_query: str, rag_answer: str, previous_analysis: Optional[str] = None) -> str: - """Generate and extract query enclosed in tags.""" - base = f"Based on:\n- Original Query: '{original_query}'\n- RAG Answer: '{rag_answer}'" - if previous_analysis: - base += f"\n- Previous Findings: {previous_analysis}" - prompt = ( - f"{base}\n" - "Generate a concise search query (up to 30 words) that either directly answers the original question " - "or complements previous findings by seeking missing or updated information. " - "Enclose your answer within tags." - ) - messages = [SystemMessage(content="You are a search query generator."), HumanMessage(content=prompt)] - response = self.llm.invoke(messages) - - raw = self.clean_llm_output(response.content) - extracted = self.extract_query_from_llm_output(raw) - return self.validate_search_query(' '.join(extracted.split()), original_query) - - def analyze_search_results(self, original_query: str, rag_answer: str, results: List[Dict[str, str]]) -> str: - """Analyze search results and create a combined summary with RAG and web information.""" - collated = '\n\n'.join([f"Source: {r['title']}\n{r.get('body', '')}" for r in results]) - prompt = ( - f"Original Query: {original_query}\n" - f"Current Knowledge (RAG): {rag_answer}\n" - "New Information from Web:\n" f"{collated}\n\n" - "Provide a detailed analysis that combines the RAG knowledge with the new web information. " - "Your response should:\n" - "1. Integrate both RAG and web information comprehensively\n" - "2. Include specific details, facts, and findings\n" - "3. Highlight important updates or corrections from the web sources\n" - "Structure your response as follows in the tags :\n" - "- Summary: A summary of all key points from the RAG and the web sources responding directly to the query\n" - "- More detailed informations: Any additional useful information more detailed\n" - "and outside of the tags:\n" - "ADDITIONAL GAPS:\n" - "- [List any remaining questions or areas needing more research in order to improve the answer of the original query]" - ) - messages = [ - SystemMessage(content="You are a research analyst focused on providing detailed, comprehensive analysis."), - HumanMessage(content=prompt), - ] - response = self.llm.invoke(messages) - return self.clean_llm_output(response.content) - - @staticmethod - def extract_enhanced_answer(text: str) -> Optional[Dict[str, str]]: - """Extract the content between enhanced_answer tags, focusing on the last occurrence.""" - try: - pattern = r'\s*(.*?)\s*' - matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE) - - if not matches: - # fallback extraction if tags missing - pattern_sum = r'Summary:(.*?)(?:More detailed informations:|ADDITIONAL GAPS:|$)' - summary_matches = re.findall(pattern_sum, text, re.DOTALL | re.IGNORECASE) - - pattern_det = r'More detailed informations:(.*?)(?:ADDITIONAL GAPS:|$)' - details_matches = re.findall(pattern_det, text, re.DOTALL | re.IGNORECASE) - - if summary_matches or details_matches: - return { - 'summary': summary_matches[-1].strip() if summary_matches else "", - 'details': details_matches[-1].strip() if details_matches else "" - } - return None - - extracted_content = matches[-1].strip() - - summary_lines = [] - details_lines = [] - current_section = None - - for line in extracted_content.splitlines(): - line = line.strip() - if not line or line.startswith('Source:') or line.startswith('New Information'): - continue - - if line.lower().startswith('summary:'): - current_section = 'summary' - continue - elif line.lower().startswith('more detailed informations'): - current_section = 'details' - continue - - if current_section == 'summary': - summary_lines.append(line) - elif current_section == 'details': - details_lines.append(line) - - return { - 'summary': '\n'.join(summary_lines).strip(), - 'details': '\n'.join(details_lines).strip() - } - except Exception as e: - print(f"Warning: Error during extraction: {e}") - return None - - def process_record(self, record: Dict[str, Any]) -> Dict[str, Any]: - """Process a single JSON record with web search enhancement.""" - user_query = record.get('input', '') - initial_rag_answer = record.get('answer', '') - - # Generate initial summary - initial_summary = self.generate_summary(initial_rag_answer, user_query) - - previous_analysis = None - all_sources: List[Dict[str, str]] = [] - seen_urls: Set[str] = set() - current_knowledge = initial_summary - final_answer = None - - for i in range(1, self.config.n_loops + 1): - print(f"Loop {i} of {self.config.n_loops}") - query = self.generate_search_query(user_query, current_knowledge, previous_analysis) - print(f"Generated query: {query}, based on the original query: {user_query}") - - results = self.duckduckgo_search(query, self.config.max_searches) - - # Add only new sources (de-dup by url) - for r in results: - if r['url'] not in seen_urls: - all_sources.append({'title': r['title'], 'url': r['url']}) - seen_urls.add(r['url']) - - analysis = self.analyze_search_results(user_query, current_knowledge, results) - - extracted_answer = self.extract_enhanced_answer(analysis) - if extracted_answer: - current_knowledge = extracted_answer.get('summary', '') + '\n' + extracted_answer.get('details', '') - final_answer = extracted_answer - else: - # fallback to full analysis if no extraction - current_knowledge = analysis - - previous_analysis = analysis - - return { - 'query': user_query, - 'RAG_summary': initial_summary, - 'WEB_RAG_summary': final_answer.get('summary', '') if final_answer else "", - 'WEBSEARCH_details': final_answer.get('details', '') if final_answer else "", - 'sources': all_sources, - } - - def run(self): - """Run the websearch pipeline.""" - with open(self.config.input_file, 'r', encoding='utf-8') as f: - data = json.load(f) - - results = [self.process_record(record) for record in data] - - output_path = Path(self.config.output_file) - output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, 'w', encoding='utf-8') as f: - json.dump(results, f, indent=2, ensure_ascii=False) - - print(f"\nResults saved to {output_path}") diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index ce23d376..950bb744 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -1,6 +1,6 @@ # mmore/websearch/config.py -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Dict from pathlib import Path import yaml @@ -16,7 +16,7 @@ class WebsearchConfig: Fields: rag_config_path: (str or None) Path to the RAG config YAML. Required if use_rag=True. use_rag: (bool) If True, run RAG first; otherwise skip directly to sub-query generation. - rag_summary: (bool) If True, run an initial LLM-based summary of the RAG answer. + use_summary: (bool) If True, run an initial LLM-based summary of the RAG answer. input_file: (str) Path to the JSON file used as “queries” (or RAG output). output_file: (str) Path where the enhanced JSON results will be written. n_subqueries: (int) Number of sub-queries to generate via LLM. @@ -25,36 +25,23 @@ class WebsearchConfig: """ rag_config_path: str # e.g. "../rag/config.yaml" - use_rag: bool - rag_summary: bool input_file: str input_queries: str output_file: str - n_subqueries: int - n_loops : int - max_searches: int - llm_config: Dict[str, Any] + mode : str + llm_config: LLMConfig = field(default_factory=lambda: LLMConfig(llm_name="gpt-4")) + use_rag: bool = True + use_summary: bool = False + n_subqueries: int = 3 + n_loops : int = 2 + max_searches: int = 10 + + def __post_init__(self): + required_fields = ["rag_config_path", "input_file", "output_file", "mode"] + for field in required_fields: + if not getattr(self, field): + raise ValueError(f"'{field}' is a required field.") - @staticmethod - def from_dict(d: Dict[str, Any]) -> "WebsearchConfig": - # Validate required keys - required = ["use_rag", "rag_summary", "input_file", "output_file", "n_loops", "n_subqueries", "max_searches", "llm_config"] - for key in required: - if key not in d: - raise ValueError(f"Missing '{key}' in WebsearchConfig.") - rag_config_path = d.get("rag_config_path", "") - return WebsearchConfig( - rag_config_path=rag_config_path, - use_rag=d["use_rag"], - rag_summary=d["rag_summary"], - input_file=d["input_file"], - input_queries=d["input_queries"], - output_file=d["output_file"], - n_loops=d["n_loops"], - n_subqueries=int(d["n_subqueries"]), - max_searches=int(d["max_searches"]), - llm_config=d["llm_config"], - ) def get_llm_config(self) -> LLMConfig: """ diff --git a/src/mmore/websearchRAG/llm.py b/src/mmore/websearchRAG/llm.py deleted file mode 100644 index 7e5344db..00000000 --- a/src/mmore/websearchRAG/llm.py +++ /dev/null @@ -1,41 +0,0 @@ - - - - - -def generate_query_scratch(query): - return "query" - - -def resume_from_RAG(query, RAG_output): - return "query" - - -def define_n_queries_based_on_RAG(query, RAG_output): - return "query" - - -def resume_web_search(query, web_search_output): - return "query" - - -def resume_all_web_search(query, web_search_output): - return "query" - - -def resume_all_web_search_and_RAG_summary(query, RAG_output_summary, web_search_output): - return "query" - -def resume_all_web_search_and_RAG_full(query, RAG_output_summary, web_search_output): - return "query" - - -def keep_only_the_most_relevant_web_search(query, web_search_output): - return "query" - - - -def provide_final_answer(): - return answer_short, answer_long - - diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index cb3c286a..27141840 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -8,6 +8,7 @@ import time from langchain_community.tools import DuckDuckGoSearchResults +from duckduckgo_search.exceptions import RatelimitException from langchain_community.utilities import DuckDuckGoSearchAPIWrapper @@ -35,6 +36,7 @@ def __init__(self, config: WebsearchConfig): self.llm = self._initialize_llm() self.rag_results: Optional[List[Dict[str, Any]]] = None + def _initialize_llm(self) -> LLM: if self.config.use_rag is True: rag_cfg = self.config.access_rag_config() @@ -53,7 +55,7 @@ def _initialize_llm(self) -> LLM: def generate_summary(self, rag_answer: str, query: str) -> str: """ - Summarize the RAG answer (used when rag_summary=True). + Summarize the RAG answer (used when rag_summary=True) """ prompt = ( "You have only the following context to answer the question, do not use any external knowledge.\n\n" @@ -62,7 +64,7 @@ def generate_summary(self, rag_answer: str, query: str) -> str: f"{rag_answer}\n\n" "If the context contains the answer or any useful information, respond with that information. \n" "If no useful informations are, answer: no useful informations" - "Answer:" + "Answer: \n" "---------------------------" ) @@ -75,39 +77,24 @@ def generate_summary(self, rag_answer: str, query: str) -> str: ] response = self.llm.invoke(messages) print("##SUMMARY CLEAN##") - print(self._clean_section(response.content)) - return self._clean_section(response.content) + print(self._clean_llm_output(response.content)) + return self._clean_llm_output(response.content) - def _clean_section(self, content: str) -> str: - delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" - subquery_section = content.split(delimiter)[-1].strip() - subquery_section = subquery_section.lower().strip() - print("##Current Response##") - print(subquery_section) - print("##") - return subquery_section - @staticmethod - def is_useful(text: str) -> bool: - t = text.strip().lower() - if not t or t.startswith("i don't know") or t.startswith("no"): - return False - return True - - def clean_llm_output(self, content): - # Define the delimiter after which the subqueries are located + + def _clean_llm_output(self, content: str): delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + + if delimiter not in content: + return [] if extract_subqueries else "" + + # Extract the section after the delimiter + cleaned_section = content.split(delimiter, 1)[-1].lower().strip() + + return cleaned_section + - # Split the content based on the delimiter - if delimiter in content: - subquery_section = content.split(delimiter, 1)[-1] - # Use regex to extract lines matching the subquery format - subquery_section = subquery_section.lower().strip() - subqueries = re.findall(r"subquery \d+: (.*)", subquery_section.strip()) - return subqueries - else: - return [] def generate_subqueries( self, @@ -140,44 +127,37 @@ def generate_subqueries( ] response = self.llm.invoke(messages) - print("######") - print("Clean response: ", self.clean_llm_output(response.content)) - print("--------------------") - return self.clean_llm_output(response.content) - - # @staticmethod - # def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: - # time.sleep(2) - # try: - # with DDGS() as ddgs: - # print("query:", query) - # results = ddgs.text(query, max_results=max_results) - # return [{"title": r.get("title", ""), "url": r.get("href", "")} for r in results] - # except Exception as e: - # logger.error(f"DuckDuckGo error: {e}") - # return [] + cleaned_answer = self._clean_llm_output(response.content) + cleaned_answer = re.findall(r"subquery \d+: (.*)", cleaned_answer) + # print("######") + # print("Clean response: ", cleaned_answer) + # print("--------------------") + return cleaned_answer + + + @staticmethod def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: """ - Perform a DuckDuckGo search using LangChain DuckDuckGo wrapper. + Perform a DuckDuckGo search using LangChain DuckDuckGo wrapper - Returns a list of dicts with keys: 'title' and 'url'. + Returns a list of dicts with keys: 'title' and 'url' """ - time.sleep(2) # polite delay + time.sleep(2) # delay to try to avoid error 202 ### TO BE IMPROVED ### try: - wrapper = DuckDuckGoSearchAPIWrapper(max_results=max_results) + wrapper = DuckDuckGoSearchAPIWrapper(max_results=max_results, backend='auto') search = DuckDuckGoSearchResults(api_wrapper=wrapper, output_format="list") - # Use run() method with output_format="list" to get list of dicts + results = search.invoke(query) - # Each item is expected to have keys like: 'title', 'link', 'snippet' - # Map 'link' to 'url' for compatibility with existing code + formatted_results = [] for r in results: snippet = r.get("snippet", "") url = r.get("link", "") # note: it's "link" in LangChain results if url: formatted_results.append({"snippet": snippet, "url": url}) + print("Websearch", formatted_results) return formatted_results except Exception as e: logger.error(f"DuckDuckGo search error: {e}") @@ -203,7 +183,7 @@ def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[st msgs = [SystemMessage(content="You are a research assistant."), HumanMessage(content=prompt)] resp = self.llm.invoke(msgs) # parse - clean_content = self._clean_section(resp.content) + clean_content = self._clean_llm_output(resp.content) sa_matches = re.findall( r"short answer:\s*(.*?)(?=detailed answer:)", @@ -222,8 +202,6 @@ def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[st - - def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: qr = rec.get("input", "").strip() rag_ans = rec.get("answer", "") if self.config.use_rag else "" @@ -259,7 +237,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: subquery_snippets.append(snippet) # Summarize each subquery's snippets independently if rag_summary is True - if self.config.rag_summary: + if self.config.use_summary: if subquery_snippets: combined_snippets = "\n".join(subquery_snippets) summary = self.generate_summary(combined_snippets, sq) @@ -267,13 +245,13 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: else: subquery_summaries.append("") - if self.config.rag_summary: + if self.config.use_summary: combined_sub_summaries = "\n".join([str(s) if s else "" for s in subquery_summaries]) web_summary = self.generate_summary(combined_sub_summaries, qr) web_summaries.append(web_summary) #print("Current websummary: ", web_summary) - # Combine rag summary, web summary, and original query for final integration + # Combine rag summary, web summary, and original query for final answer context_for_llm = f"RAG informations:\n{rag_summary or ''}\n\nWeb informations:\n{web_summary}" else: # If not summarizing subqueries, use rag summary or current context with snippets @@ -282,11 +260,11 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: combined_web_summaries = "\n".join([str(s) if s else "" for s in web_summaries]) web_summary_all = self.generate_summary(combined_web_summaries, qr) - # Integrate all info with LLM + # Current context, web content to generate the answer out = self.integrate_with_llm(qr, context_for_llm, snippets) final_short, final_detailed = out["short"], out["detailed"] - # Prepare context for next loop iteration + # Prepare context for next search loop current_context = final_detailed @@ -294,7 +272,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: return { "query": qr, "rag_summary": rag_summary if self.config.use_rag else None, - "web_summary": web_summary_all if self.config.rag_summary else None, + "web_summary": web_summary_all if self.config.use_summary else None, "short_answer": final_short, "detailed_answer": final_detailed, "sources": list(all_sources), From 87aa7fbd8e969ed1e79121b54884adf2f478f24c Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Thu, 19 Jun 2025 14:00:48 -0700 Subject: [PATCH 09/33] websearch function improved --- src/mmore/websearchRAG/pipeline.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index 27141840..d24c0691 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -35,22 +35,23 @@ def __init__(self, config: WebsearchConfig): self.config = config self.llm = self._initialize_llm() self.rag_results: Optional[List[Dict[str, Any]]] = None + self.wrapper = DuckDuckGoSearchAPIWrapper(max_results=self.config.max_searches, backend='auto') + self.search = DuckDuckGoSearchResults(api_wrapper=self.wrapper, output_format="list") def _initialize_llm(self) -> LLM: - if self.config.use_rag is True: + if self.config.use_rag : rag_cfg = self.config.access_rag_config() llm_conf = rag_cfg.get("rag", {}).get("llm") if llm_conf is None: raise ValueError("Missing 'llm' config under 'rag' in RAG configuration.") return LLM.from_config(LLMConfig(**llm_conf)) - elif self.config.use_rag is False: + else : base_conf = self.config.get_llm_config() if isinstance(base_conf, LLMConfig): # Ensure it's a dictionary base_conf = base_conf.__dict__ return LLM.from_config(LLMConfig(**base_conf)) - else: - raise ValueError("Invalid value for 'use_rag'. Must be True or False.") + def generate_summary(self, rag_answer: str, query: str) -> str: @@ -137,8 +138,7 @@ def generate_subqueries( - @staticmethod - def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]]: + def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: """ Perform a DuckDuckGo search using LangChain DuckDuckGo wrapper @@ -146,10 +146,7 @@ def duckduckgo_search(query: str, max_results: int = 10) -> List[Dict[str, str]] """ time.sleep(2) # delay to try to avoid error 202 ### TO BE IMPROVED ### try: - wrapper = DuckDuckGoSearchAPIWrapper(max_results=max_results, backend='auto') - search = DuckDuckGoSearchResults(api_wrapper=wrapper, output_format="list") - - results = search.invoke(query) + results = self.search.invoke(query) formatted_results = [] for r in results: @@ -225,7 +222,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: for sq in subs: #print("subquery:", sq) - res = self.duckduckgo_search(sq, max_results=self.config.max_searches) + res = self.duckduckgo_search(query = sq) subquery_snippets = [] for r in res: From 65c8dbcd85f95353ae88aa6e749f19a66b93ec4c Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Sun, 22 Jun 2025 12:07:08 -0700 Subject: [PATCH 10/33] API mode working + sources output update --- src/mmore/run_websearch.py | 156 +++++++++++++++++++---------- src/mmore/websearchRAG/config.py | 25 ++--- src/mmore/websearchRAG/pipeline.py | 79 +++++++++++++-- 3 files changed, 188 insertions(+), 72 deletions(-) diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index d8b2be24..ac387e82 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -9,99 +9,137 @@ import uvicorn from fastapi import FastAPI +from pydantic import BaseModel, Field from langserve import add_routes +from dotenv import load_dotenv from .websearchRAG.logging_config import logger from .utils import load_config from .websearchRAG.config import WebsearchConfig from .websearchRAG.pipeline import WebsearchPipeline -from .run_rag import LocalConfig, APIConfig, create_api +from .rag.pipeline import RAGConfig, RAGPipeline +from .run_rag import LocalConfig, APIConfig, RAGInferenceConfig + +from .run_rag import create_api as create_api_rag from .websearchRAG.logging_config import logger # Import the shared logger +#à quoi ça sert? +load_dotenv() + + # CUDA tweaks for best perf torch.backends.cuda.enable_mem_efficient_sdp(False) torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_math_sdp(True) -@dataclass -class WebsearchSection: - use_rag: bool - rag_config_path: str - use_summary: bool - n_subqueries: int - input_file: str - input_queries: str - output_file: str - n_loops: int - max_searches: int - llm_config: Dict[str, Any] - mode: str - @dataclass class WebsearchInferenceConfig: - websearch: WebsearchSection - mode: str = "local" # "local" or "api" + websearch: WebsearchConfig mode_args: Optional[Union[LocalConfig, APIConfig]] = None def __post_init__(self): - if self.mode == "api" and self.mode_args is None: + if self.websearch.mode == "api" and self.mode_args is None: self.mode_args = APIConfig() -def build_pipeline(ws: WebsearchSection) -> WebsearchPipeline: - ws_dict = asdict(ws) - config_fields = {f.name for f in fields(WebsearchConfig)} - filtered_dict = {k: v for k, v in ws_dict.items() if k in config_fields} - web_cfg = WebsearchConfig(**filtered_dict) - return WebsearchPipeline(config=web_cfg) - def run_websearch(config_file): # 1) Load config cfg = load_config(config_file, WebsearchInferenceConfig) - ws = cfg.websearch - #logger.info("Configuration file", ws) - if not cfg.mode: - raise ValueError("Configuration is missing the 'mode' field. Ensure it is set to 'local' or 'api'.") - - - # 2) Build pipeline once - # web_cfg = WebsearchConfig(**asdict(ws)) - # pipeline = WebsearchPipeline(config=web_cfg) - pipeline = build_pipeline(ws) - - # 3) Dispatch on mode - if cfg.mode == "local": + ws = cfg.websearch + if ws.mode == "local": + pipeline = WebsearchPipeline(config=ws) logger.info("Running Websearch pipeline in LOCAL mode...") start = time.time() pipeline.run() logger.info(f"Completed in {time.time() - start:.2f}s") - elif cfg.mode == "api": + elif ws.mode == "api": logger.info("Starting Websearch pipeline in API mode...") - # wrap pipeline.__call__ via create_api - app: FastAPI = create_api(pipeline, cfg.mode_args.endpoint) - - @app.get("/health") - def _health(): - return {"status": "healthy"} - - uvicorn.run( - app, - host=cfg.mode_args.host, - port=cfg.mode_args.port, - log_level="info", - ) + app = create_api(cfg) + uvicorn.run(app, host="0.0.0.0", port=8000) else: raise ValueError(f"Unknown mode: {cfg.mode!r}. Must be 'local' or 'api'.") + +class QueryInput(BaseModel): + input: str = Field(..., description="The user query") + collection_name: Optional[str] = Field( + None, description="The collection to search (optional)" + ) + +class WebQuery(BaseModel): + query: QueryInput = Field( + ..., + description="Search query with input and optional collection name" + ) + use_rag: bool = Field( + False, + description="Include RAG context", + example=True + ) + use_summary: bool = Field( + True, + description="Enable subquery summary", + example=False + ) + + + + + +def create_api(config_file: str): + app = FastAPI( + title="mmore Websearch API", + description="""This API is based on the OpenAPI 3.1 specification. You can find out more about Swagger at [https://swagger.io](https://swagger.io). + +## Overview + +This API defines the retriever API of mmore, handling: + +1. **File Operations** - Direct file management within mmore. +2. **Rag and websearch** - Search based on the query/documents.""", + version="1.0.0", + ) + + logger.info("Websearch loaded!") + + @app.post("/websearch") + # query = query parameter + def websearch(query: WebQuery): + #charge la pipeline directement depuis rag_pp + #changer le config_file avec le config file du rag --> ajouter ce que l'utilisateur demande + pipeline = WebsearchPipeline(config=config_file.websearch) + + if query.use_rag: + logger.info("Launch RAG") + config_RAG = load_config(config_file.websearch.rag_config_path, RAGInferenceConfig) + logger.info("Creating the RAG Pipeline...") + rag_pp = RAGPipeline.from_config(config_RAG.rag) + data = rag_pp([query.query.dict()], return_dict=True) + logger.info("RAG done") + logger.info("##RAG##", data) + else: + data = query.query + + logger.info("Launch websearch") + + answers = pipeline.run_api(query.use_rag, query.use_summary, data) + logger.info("Websearch done") + + + return answers + + return app + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run the Websearch (+ optional RAG) pipeline.") parser.add_argument( @@ -111,4 +149,16 @@ def _health(): help="Path to the Websearch configuration file (YAML)." ) args = parser.parse_args() - run_websearch(args.config_file) \ No newline at end of file + run_websearch(args.config_file) + + + + +# { +# "query": { +# "input": "When was Barack Obama born?", +# "collection_name": "my_docs" +# }, +# "use_rag": true, +# "use_summary": true +# } \ No newline at end of file diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index 950bb744..05461739 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -1,7 +1,7 @@ # mmore/websearch/config.py from dataclasses import dataclass, field -from typing import Any, Dict +from typing import Any, Dict, Optional from pathlib import Path import yaml @@ -19,37 +19,38 @@ class WebsearchConfig: use_summary: (bool) If True, run an initial LLM-based summary of the RAG answer. input_file: (str) Path to the JSON file used as “queries” (or RAG output). output_file: (str) Path where the enhanced JSON results will be written. + input_queries: (str) Path to queries file. n_subqueries: (int) Number of sub-queries to generate via LLM. + n_loops: (int) Number of loops to run the process. max_searches: (int) Max results to fetch from DuckDuckGo per sub-query. llm_config: (dict) Passed to rag.llm.LLMConfig (keys: llm_name, max_new_tokens, temperature, etc.) + mode: (str) Mode of operation ("local" or "api"). """ - rag_config_path: str # e.g. "../rag/config.yaml" - input_file: str - input_queries: str - output_file: str - mode : str - llm_config: LLMConfig = field(default_factory=lambda: LLMConfig(llm_name="gpt-4")) - use_rag: bool = True + rag_config_path: str # e.g., "../rag/config.yaml" + use_rag: bool = False use_summary: bool = False + input_file: Optional[str] = None + input_queries: Optional[str] = None + output_file: Optional[str] = None n_subqueries: int = 3 - n_loops : int = 2 + n_loops: int = 2 max_searches: int = 10 + llm_config: Dict[str, Any] = field(default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 100}) + mode: str = "local" def __post_init__(self): - required_fields = ["rag_config_path", "input_file", "output_file", "mode"] + required_fields = ["rag_config_path", "llm_config", "mode"] for field in required_fields: if not getattr(self, field): raise ValueError(f"'{field}' is a required field.") - def get_llm_config(self) -> LLMConfig: """ Convert the nested llm_config dict into an instance of rag.llm.LLMConfig. """ return LLMConfig(**self.llm_config) - def access_rag_config(self) -> Dict[str, Any]: """ Access and parse the RAG configuration file defined in `rag_config_path`. diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index d24c0691..a754ef01 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -6,6 +6,8 @@ from typing import Dict, List, Any, Optional, Set import logging import time +import tempfile +import os from langchain_community.tools import DuckDuckGoSearchResults from duckduckgo_search.exceptions import RatelimitException @@ -34,7 +36,7 @@ class WebsearchPipeline: def __init__(self, config: WebsearchConfig): self.config = config self.llm = self._initialize_llm() - self.rag_results: Optional[List[Dict[str, Any]]] = None + self.rag_results = None self.wrapper = DuckDuckGoSearchAPIWrapper(max_results=self.config.max_searches, backend='auto') self.search = DuckDuckGoSearchResults(api_wrapper=self.wrapper, output_format="list") @@ -77,8 +79,8 @@ def generate_summary(self, rag_answer: str, query: str) -> str: HumanMessage(content=prompt), ] response = self.llm.invoke(messages) - print("##SUMMARY CLEAN##") - print(self._clean_llm_output(response.content)) + # print("##SUMMARY CLEAN##") + # print(self._clean_llm_output(response.content)) return self._clean_llm_output(response.content) @@ -152,8 +154,9 @@ def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: for r in results: snippet = r.get("snippet", "") url = r.get("link", "") # note: it's "link" in LangChain results + title = r.get("title", "") if url: - formatted_results.append({"snippet": snippet, "url": url}) + formatted_results.append({"snippet": snippet, "url": url, "title" : title}) print("Websearch", formatted_results) return formatted_results except Exception as e: @@ -202,10 +205,11 @@ def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[st def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: qr = rec.get("input", "").strip() rag_ans = rec.get("answer", "") if self.config.use_rag else "" + self.rag_results = rag_ans rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None all_sources: Set[str] = set() - current_context = rag_summary or "" + current_context = rag_summary final_short, final_detailed = "", "" web_summary = "" @@ -225,9 +229,16 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: res = self.duckduckgo_search(query = sq) subquery_snippets = [] + # for r in res: + # if r["url"] not in all_sources: + # all_sources.add(r["url"]) + + for r in res: if r["url"] not in all_sources: - all_sources.add(r["url"]) + all_sources[url] = [] + all_sources[url].append(r["title"]) + snippet = f"{r['snippet']})" snippets.append(snippet) #print("Current sub-snippet", snippet) @@ -268,6 +279,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: return { "query": qr, + "rag informations" : self.rag_results, "rag_summary": rag_summary if self.config.use_rag else None, "web_summary": web_summary_all if self.config.use_summary else None, "short_answer": final_short, @@ -279,7 +291,6 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: - def run(self): # RAG pipeline if self.config.use_rag: @@ -308,3 +319,57 @@ def run(self): with open(outp, 'w', encoding='utf-8') as f: json.dump(outputs, f, ensure_ascii=False, indent=2) print(f"Results saved to {outp}") + + + + + def run_api(self, use_rag, use_summary, query): + """ + Process queries and handle them with a temporary JSONL file. + + Parameters: + - use_rag (bool): Indicates whether to use RAG. + - use_summary (bool): Indicates whether to use summarization. + - query (list): List of query dictionaries. + + Returns: + - List of processed query results. + """ + # Save query to a temporary JSONL file + self.config.use_rag = use_rag + self.config.use_summary = use_summary + + temp_file_path = self._save_query_as_json(query) + + try: + outputs = [] + # Read from the temporary JSONL file + with open(temp_file_path, 'r', encoding='utf-8') as f: + if self.config.use_rag : + for line in f: + record = json.loads(line) + outputs.append(self.process_record(record)) + else: + for line in f: + record = json.loads(line.strip()) + outputs.append(self.process_record(record)) + + return outputs + + finally: + # Clean up the temporary file + print(f"Deleting temporary file: {temp_file_path}") + os.remove(temp_file_path) + + + def _save_query_as_json(self, query): + """Save query to a temporary JSONL file and return the file path.""" + suffix = '.json' if self.config.use_rag else '.jsonl' + with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False) as temp_file: + # Convert Pydantic models to dictionaries if needed + if isinstance(query, list): + temp_file.writelines(json.dumps(q.dict() if hasattr(q, "dict") else q) + '\n' for q in query) + else: + temp_file.write(json.dumps(query.dict() if hasattr(query, "dict") else query) + '\n') + print(f"Query saved to temporary file: {temp_file.name}") + return temp_file.name \ No newline at end of file From d1c513dc56ee1df24d87d182d30ce49e6f881254 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Mon, 23 Jun 2025 01:22:17 -0700 Subject: [PATCH 11/33] config file API --- examples/websearchRAG/config_api.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 examples/websearchRAG/config_api.yaml diff --git a/examples/websearchRAG/config_api.yaml b/examples/websearchRAG/config_api.yaml new file mode 100644 index 00000000..e2564c8f --- /dev/null +++ b/examples/websearchRAG/config_api.yaml @@ -0,0 +1,15 @@ +websearch: + rag_config_path: examples/rag/config_api.yaml + n_subqueries : 3 + n_loops: 2 + max_searches: 10 + mode: api + llm_config: + llm_name: OpenMeditron/meditron3-8b + max_new_tokens: 250 + +# Mode Config +mode_args: + endpoint: '/rag' + port: 8000 + host: 'localhost' From 1ac8e070f50d06b0627f19be53ff927f9ae8b3fd Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Mon, 23 Jun 2025 01:41:28 -0700 Subject: [PATCH 12/33] sources correct - websearch unreliable --- src/mmore/websearchRAG/pipeline.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index a754ef01..b55fb20c 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -14,6 +14,8 @@ from langchain_community.utilities import DuckDuckGoSearchAPIWrapper +from ddg import Duckduckgo + from langchain_core.messages import HumanMessage, SystemMessage from ..run_rag import rag @@ -164,7 +166,11 @@ def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: return [] - + def search_alternative(self, query): + dg_api = Duckduckgo() + results = dg_api.search(query) + print("query", query) + print("Ohh yeah yeah", results) def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[str]) -> Dict[str, str]: @@ -209,6 +215,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None all_sources: Set[str] = set() + source_map = {} current_context = rag_summary final_short, final_detailed = "", "" web_summary = "" @@ -227,6 +234,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: for sq in subs: #print("subquery:", sq) res = self.duckduckgo_search(query = sq) + self.search_alternative(sq) subquery_snippets = [] # for r in res: @@ -235,10 +243,10 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: for r in res: - if r["url"] not in all_sources: - all_sources[url] = [] - all_sources[url].append(r["title"]) - + if r["url"] not in source_map: + source_map[r["url"]] = [] # Initialize as a list for titles + source_map[r["url"]].append( r["title"]) # Add title to the list + snippet = f"{r['snippet']})" snippets.append(snippet) #print("Current sub-snippet", snippet) @@ -284,7 +292,8 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: "web_summary": web_summary_all if self.config.use_summary else None, "short_answer": final_short, "detailed_answer": final_detailed, - "sources": list(all_sources), + #"sources": list(all_sources), + "sources" : source_map, } From 381d6ccf7632fd685afa6126c84679c27a43fcd7 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Mon, 30 Jun 2025 02:13:50 -0700 Subject: [PATCH 13/33] clean + stop search function --- examples/websearchRAG/config.yaml | 12 +- .../websearchRAG/enhanced_results_trial.json | 292 +++++++++++++----- src/mmore/run_websearch.py | 18 +- src/mmore/websearchRAG/config.py | 6 +- src/mmore/websearchRAG/pipeline.py | 158 +++++----- src/mmore/websearchRAG/websearch.py | 46 +-- 6 files changed, 315 insertions(+), 217 deletions(-) diff --git a/examples/websearchRAG/config.yaml b/examples/websearchRAG/config.yaml index deb764a3..dfd34bc0 100644 --- a/examples/websearchRAG/config.yaml +++ b/examples/websearchRAG/config.yaml @@ -1,14 +1,14 @@ websearch: - use_rag: false + use_rag: true rag_config_path: examples/rag/config.yaml - use_summary: false - n_subqueries : 3 + use_summary: true + n_subqueries : 2 input_file: examples/rag/output.json input_queries: examples/rag/queries.jsonl output_file: examples/websearchRAG/enhanced_results_trial.json n_loops: 2 - max_searches: 10 + max_searches: 5 + mode: local llm_config: llm_name: OpenMeditron/meditron3-8b - max_new_tokens: 250 - mode: local \ No newline at end of file + max_new_tokens: 1200 diff --git a/examples/websearchRAG/enhanced_results_trial.json b/examples/websearchRAG/enhanced_results_trial.json index 54f940d1..9c42305f 100644 --- a/examples/websearchRAG/enhanced_results_trial.json +++ b/examples/websearchRAG/enhanced_results_trial.json @@ -1,88 +1,234 @@ [ { "query": "When was Barack Obama born?", - "rag_summary": null, - "web_summary": null, - "short_answer": "barack obama was born on august 4, 1961, in honolulu, hawaii.", - "detailed_answer": "barack obama was born in honolulu, hawaii, to barack obama sr., a kenyan economist, and ann dunham, an anthropologist from kansas. his parents met while attending the university of hawaii. they divorced when barack was two years old, leaving ann to raise him primarily on her own. obama graduated from columbia university and harvard law school. he was elected to the illinois senate in 1996 and the u.s. senate in 2004. in 2008, he became the first african-american president of the united states, serving two terms from 2009 to 2017.", - "sources": [ - "https://www.providencejournal.com/story/news/politics/2025/06/14/president-donald-trumps-birthday-is-june-14-how-old-is-he/84147169007/", - "https://www.aplustopper.com/essay-on-barack-obama/", - "https://kvia.com/politics/cnn-us-politics/2025/02/05/barack-obama-fast-facts-2/", - "https://www.mapsofworld.com/usa/presidents/barack-obama.html", - "https://www.havefunwithhistory.com/barack-obama-timeline/", - "https://facts.net/history/historical-events/38-facts-about-barack-obama-elected-u-s-president/", - "https://www.cnbctv18.com/world/barack-obama-turns-62-all-about-former-us-president-and-his-famous-quotes-19453865.htm", - "https://www.britannica.com/biography/Barack-Obama", - "https://www.britannica.com/facts/Barack-Obama", - "https://www.politifact.com/factchecks/2025/jan/13/threads-posts/no-this-isnt-proof-former-president-barack-obama-w/", - "https://www.britannica.com/biography/Barack-Obama/Politics-and-ascent-to-the-presidency", - "https://thegrio.com/2025/03/14/barack-obama/", - "https://publicfinanceinternational.org/when-is-obamas-birthday/" - ] + "rag_informations": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUse the following context to answer the questions.\n\nContext:\n[1] Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. In *Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition*, pages 9556–9567, 2024.\n- [61] Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu\n\nJiang, Weiming Ren, Yuxuan Sun, Cong Wei, Botao Yu, Ruibin Yuan, Renliang Sun, Ming Yin, Boyuan Zheng, Zhenzhu Yang, Yibo Liu, Wenhao Huang, Huan Sun, Yu Su, and Wenhu Chen. Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. *arXiv preprint arXiv:2311.16502*, 2023.\n\n- [62] Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, and Lucas Beyer. Sigmoid loss for language image pre-training. In *Proceedings of the IEEE/CVF International Conference on Computer Vision*, pages 11975–11986, 2023.\n- [63] Ruohong Zhang, Liangke Gui, Zhiqing Sun, Yihao Feng, Keyang Xu, Yuanhan Zhang, Di Fu, Chunyuan Li, Alexander Hauptmann, Yonatan Bisk, et al. Direct preference optimization of video large multimodal models from language model reward. *arXiv preprint arXiv:2404.01258*, 2024.\n- [64] Renrui Zhang, Jiaming Han, Aojun Zhou, Xiangfei Hu, Shilin Yan, Pan Lu, Hongsheng Li, Peng Gao, and Yu Qiao. LLaMA-adapter: Efficient fine-tuning of large language models with zero-initialized attention. In *The Twelfth International Conference on Learning Representations*, 2024.\n\n\n[2] - Skills: Emphasized on emotional intelligence, interpersonal skills, and creativity. - Honors & Awards: Notable achievements include the Golden Globe Award for Best Television Series and the Golden Globe Award for Best Television Special. 2. Steve Jobs: - Professional Experience: Co-founder of Apple Inc., where he led the development of iconic products like the iMac, iPod, and iPhone. - Skills: Highlighted as a thinker with a focus on innovation and problem-solving. - Honors & Awards: Recognized with numerous accolades, including the MacArthur Fellowship and the NeXT Computer Science Award. Please note that these descriptions are based on the information provided in the images and do not include any personal details or specific accomplishments mentioned in the images.\n\nUser Which of them better suits the role of the Congressman of the US? Why?\n\nOurs The first resume, featuring Oprah Winfrey, would better suit the role of a Congressman because it highlights skills and experiences that are relevant to political office, such as leadership, public speaking, and media experience. These qualities are essential for a politician to effectively represent their constituents and engage in legislative processes. The second resume, while it could be suitable for a creative or entrepreneurial role, does not specifically mention skills or experiences that align with the responsibilities of a Congressman.\n\nTable 12. An example of emerging capability: multi-doc VQA.\n\n## References\n\n- [1] Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katie Millican, Malcolm Reynolds, et al. Flamingo: a visual language model for few-shot learning. *arXiv preprint*, 2022.\n- [2] Anas Awadalla, Irena Gao, Josh Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Shiori Sagawa, et al. Openflamingo: An open-source framework for training large autoregressive vision-language models. *arXiv preprint arXiv:2308.01390*, 2023.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhen was Barack Obama born?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBarack Obama was born on August 4, 1961.", + "rag_summary": "barack obama was born on august 4, 1961.\n\nif the context contains the answer or any useful information, respond with that information. \nif no useful informations are, answer: no useful informations\nanswer: \n---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nthe provided context does not contain any information about barack obama's birthdate.", + "web_summary": "no useful information.", + "short_answer": "barack obama was born on august 4, 1961.", + "detailed_answer": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nshort answer: barack obama was born on august 4, 1961.\n\ndetailed answer: barack obama was born on august 4, 1961, in honolulu, hawaii, to barack obama, sr., and stanley ann dunham.", + "sources": { + "https://www.w3resource.com/sql-exercises/sql-subqueries-exercises.php": [ + "SQL SUBQUERIES - Exercises, Practice, Solution - w3resource" + ], + "https://www.geeksforgeeks.org/sql/how-to-use-a-subquery-in-a-select-statement/": [ + "How to Use a Subquery in a SELECT Statement - GeeksforGeeks" + ], + "https://www.datacamp.com/tutorial/sql-subquery": [ + "SQL Subquery: A Comprehensive Guide - DataCamp" + ], + "https://www.kdnuggets.com/beginners-guide-subqueries-sql": [ + "Beginner's Guide to Subqueries in SQL - KDnuggets" + ], + "https://www.britannica.com/biography/Barack-Obama": [ + "Barack Obama | Biography, Parents, Education, Presidency, Books ..." + ], + "https://kvia.com/politics/cnn-us-politics/2025/02/05/barack-obama-fast-facts-2/": [ + "Barack Obama Fast Facts - KVIA" + ], + "https://chroniclepaths.com/public-figures/biography-of-barack-obama": [ + "Biography of Barack Obama - chroniclepaths.com" + ], + "https://www.havefunwithhistory.com/barack-obama-timeline/": [ + "Barack Obama Timeline - Have Fun With History" + ], + "https://www.politifact.com/factchecks/2025/jan/13/threads-posts/no-this-isnt-proof-former-president-barack-obama-w/": [ + "Image shows former President Barack Obama was born in Kenya." + ], + "https://ohmyfacts.com/celebrities/45-facts-about-barack-obama/": [ + "45 Facts About Barack Obama - OhMyFacts" + ], + "https://www.geeksforgeeks.org/nested-queries-in-sql/": [ + "Nested Queries in SQL - GeeksforGeeks" + ], + "https://learn.microsoft.com/en-us/sql/relational-databases/performance/subqueries?view=sql-server-ver17": [ + "Subqueries (SQL Server) - SQL Server | Microsoft Learn" + ], + "https://factcheckhub.com/false-claim-about-obamas-birthplace-circulates-online/": [ + "False claim about Obama's birthplace circulates online" + ], + "https://www.havefunwithhistory.com/facts-about-barack-obama/": [ + "10 Facts About Barack Obama - Have Fun With History" + ], + "https://slcl.libguides.com/c.php?g=1426394&p=10653829": [ + "The U.S. Government for Kids and Teens: Barack Obama" + ], + "https://en.wikipedia.org/wiki/Early_life_and_career_of_Barack_Obama": [ + "Early life and career of Barack Obama - Wikipedia" + ], + "https://www.snopes.com/fact-check/birthing-pains/": [ + "Kenyan Government Releases Obama's Real Birth Certificate" + ], + "https://www.obamalibrary.gov/obamas/president-barack-obama": [ + "President Barack Obama | Barack Obama Presidential Library" + ] + } }, { "query": "Who founded Google?", - "rag_summary": null, - "web_summary": null, - "short_answer": "larry page and sergey brin founded google in 1998.", - "detailed_answer": "google was founded in 1998 by two ph.d. students at stanford university, larry page and sergey brin. initially, the company focused on developing a search engine that could provide relevant and accurate search results. over the years, google expanded its services to include a wide range of products and services, such as google drive, google maps, youtube, and google assistant. today, google is one of the most influential tech companies globally, with over 140,000 employees and a market value of over $1 trillion.", - "sources": [ - "https://worldhistoryjournal.com/2025/02/21/how-google-started-and-became-a-global-tech-powerhouse/", - "https://newswirejet.com/google-mission-and-vision-statement/", - "https://www.indiatoday.in/technology/news/story/google-co-founder-sergey-brin-is-back-from-semi-retirement-is-now-daily-working-at-google-2728026-2025-05-21", - "https://umatechnology.org/a-brief-history-of-google-from-1998-to-the-present-day/", - "https://www.britannica.com/money/Google-Inc", - "https://www.businessinsider.com/sergey-brin-career-life-education?op=1", - "https://www.reference.com/business-finance/google-s-vision-statement-66f207e3583fca5f-14", - "https://panmore.com/google-vision-statement-mission-statement", - "https://biographyhost.com/p/larry-page-biography.html", - "https://canvasbusinessmodel.com/blogs/mission/google-mission", - "https://interestingengineering.com/culture/almost-everything-you-need-to-know-about-googles-history", - "https://en.wikipedia.org/wiki/Larry_Page", - "https://www.link-assistant.com/news/how-old-is-google.html" - ] + "rag_informations": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUse the following context to answer the questions.\n\nContext:\n[1] - Skills: Emphasized on emotional intelligence, interpersonal skills, and creativity. - Honors & Awards: Notable achievements include the Golden Globe Award for Best Television Series and the Golden Globe Award for Best Television Special. 2. Steve Jobs: - Professional Experience: Co-founder of Apple Inc., where he led the development of iconic products like the iMac, iPod, and iPhone. - Skills: Highlighted as a thinker with a focus on innovation and problem-solving. - Honors & Awards: Recognized with numerous accolades, including the MacArthur Fellowship and the NeXT Computer Science Award. Please note that these descriptions are based on the information provided in the images and do not include any personal details or specific accomplishments mentioned in the images.\n\nUser Which of them better suits the role of the Congressman of the US? Why?\n\nOurs The first resume, featuring Oprah Winfrey, would better suit the role of a Congressman because it highlights skills and experiences that are relevant to political office, such as leadership, public speaking, and media experience. These qualities are essential for a politician to effectively represent their constituents and engage in legislative processes. The second resume, while it could be suitable for a creative or entrepreneurial role, does not specifically mention skills or experiences that align with the responsibilities of a Congressman.\n\nTable 12. An example of emerging capability: multi-doc VQA.\n\n## References\n\n- [1] Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katie Millican, Malcolm Reynolds, et al. Flamingo: a visual language model for few-shot learning. *arXiv preprint*, 2022.\n- [2] Anas Awadalla, Irena Gao, Josh Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Shiori Sagawa, et al. Openflamingo: An open-source framework for training large autoregressive vision-language models. *arXiv preprint arXiv:2308.01390*, 2023.\n\n\n[2] | Jinhua,
[44]
al.
et
Zhang | China | days
1–16 | 14 | 14 | (35.7%)
5 | | | 14 |\n| Shanghai,
[45]
al.\n\n[3] | Jinhua,
[44]
al.
et
Zhang | China | days
1–16 | 14 | 14 | (35.7%)
5 | | | 14 |\n| Shanghai,
[45]
al.\n\n[4] 4%)
39 | (23.3%)
17 | | 73 |\n| [44]
al.
et
Zhang | China
Jinhua, | days
1–16 | 14 | 14 | (35.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho founded Google?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGoogle was founded by Larry Page and Sergey Brin in 1998.", + "rag_summary": "google was founded by larry page and sergey brin in 1998.\n\nif the context contains the answer or any useful information, respond with that information. \nif no useful informations are, answer: no useful informations\nanswer: \n---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ngoogle was founded by larry page and sergey brin in 1998.", + "web_summary": "no useful information is provided in the context to answer the question about who founded google.", + "short_answer": "google was founded by larry page and sergey brin in 1998.", + "detailed_answer": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nshort answer: google was founded by larry page and sergey brin in 1998.\n\ndetailed answer: google was founded on september 4, 1998, by american computer scientists larry page and sergey brin. since then, google has become the go-to search engine for internet users across the globe, from students researching for their term papers to businesses looking to attract more customers. the google search algorithm works by considering a number of factors and signals, such as the search query, content relevance, page useability, user location and source trustworthiness. the factors are then analyzed by the google ranking algorithm formula, which produces the rankings of pages in the google search results.", + "sources": { + "https://www.datacamp.com/tutorial/sql-subquery": [ + "SQL Subquery: A Comprehensive Guide - DataCamp" + ], + "https://www.geeksforgeeks.org/sql/sql-subquery/": [ + "SQL | Subquery - GeeksforGeeks" + ], + "https://www.w3resource.com/sql-exercises/sql-subqueries-exercises.php": [ + "SQL SUBQUERIES - Exercises, Practice, Solution - w3resource" + ], + "https://dev.to/abhay_yt_52a8e72b213be229/understanding-sql-subqueries-a-complete-guide-with-examples-3jff": [ + "Understanding SQL Subqueries: A Complete Guide with Examples" + ], + "https://en.wikipedia.org/wiki/Larry_Page": [ + "Larry Page - Wikipedia" + ], + "https://businesssharksmagazine.com/larry-page-and-sergey-brins-google-success/": [ + "Larry Page and Sergey Brin's Google Success Story" + ], + "https://press.farm/the-story-behind-google-search-engine/": [ + "The Story Behind Google - Pressfarm" + ], + "https://quartr.com/insights/business-philosophy/larry-page-the-mastermind-behind-google": [ + "Larry Page: The Mastermind Behind Google - Quartr" + ], + "https://www.britannica.com/money/Google-Inc": [ + "Google | History & Facts; Products & Services | Britannica Money" + ], + "https://www.jagranjosh.com/us/explainers/who-is-the-owner-of-google-1860000139": [ + "Who is the Owner of Google? - Jagran Josh" + ], + "https://interestingengineering.com/innovation/who-invented-google-and-how-did-it-happen": [ + "Who Invented Google and How did it Happen? - Interesting Engineering" + ], + "https://www.link-assistant.com/news/how-old-is-google.html": [ + "How Old Is Google? The Story Behind the Search Engine" + ], + "https://www.geeksforgeeks.org/sql/how-to-use-a-subquery-in-a-select-statement/": [ + "How to Use a Subquery in a SELECT Statement - GeeksforGeeks" + ], + "https://www.kdnuggets.com/beginners-guide-subqueries-sql": [ + "Beginner's Guide to Subqueries in SQL - KDnuggets" + ], + "https://datascienceexplain.com/sql-databases/mastering-sql-subqueries-a-complete-guide-to-writing-efficient-nested-queries/": [ + "Mastering SQL Subqueries: A Complete Guide to Writing Efficient Nested ..." + ], + "https://www.semrush.com/blog/google-search-algorithm/": [ + "How the Google Search Algorithm Works: A Zero-Fluff Guide - Semrush" + ], + "https://www.theedgesearch.com/2024/10/google-algorithms.html": [ + "Google algorithms explained: Everything you need to know" + ], + "https://www.searchenginejournal.com/google-algorithm-history/": [ + "Google Algorithm Updates & Changes: A Complete History" + ], + "https://www.searchlogistics.com/glossary/google-algorithm/": [ + "What Is The Google Algorithm? - SearchLogistics" + ] + } }, { "query": "Where is the Eiffel Tower located?", - "rag_summary": null, - "web_summary": null, + "rag_informations": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUse the following context to answer the questions.\n\nContext:\n[1] |---------------------|----------------------|------|------------------------|-------------------------|------------------|---------|--|\n| In-the-front | Interleaved | 52.9 | 36.8 | 30.5 | 70.1 | 74.0 | |\n| | In-the-front | 54.3 | 36.6 | 32.8 | 74.7 | 75.3 | |\n| Interleaved | Interleaved | 55.4 | 37.8 | 32.9 | 76.2 | 76.0 | |\n| | In-the-front | 52.4 | 36.1 | 29.0 | 72.9 | 71.8 | |\n| Mixed | Interleaved | 57.0 | 38.\n\n[2] |---------------------|----------------------|------|------------------------|-------------------------|------------------|---------|--|\n| In-the-front | Interleaved | 52.9 | 36.8 | 30.5 | 70.1 | 74.0 | |\n| | In-the-front | 54.3 | 36.6 | 32.8 | 74.7 | 75.3 | |\n| Interleaved | Interleaved | 55.4 | 37.8 | 32.9 | 76.2 | 76.0 | |\n| | In-the-front | 52.4 | 36.1 | 29.0 | 72.9 | 71.8 | |\n| Mixed | Interleaved | 57.0 | 38.\n\n[3] | Jinhua,
[44]
al.
et
Zhang | China | days
1–16 | 14 | 14 | (35.7%)
5 | | | 14 |\n| Shanghai,
[45]
al.\n\n[4] | Jinhua,
[44]
al.
et
Zhang | China | days
1–16 | 14 | 14 | (35.7%)
5 | | | 14 |\n| Shanghai,
[45]
al.\n\n[5] 7%)
5 | | | 14 |\n| [45]
al.
et
Ling | China
Shanghai, | days
22 | 66 | 66 | 66 | (88.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhere is the Eiffel Tower located?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe Eiffel Tower is located in Paris, France.", + "rag_summary": "the eiffel tower is located in paris, france.\n\nif the context contains the answer or any useful information, respond with that information. \nif no useful informations are, answer: no useful informations\nanswer: \n---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nno useful information", + "web_summary": "no useful information.", "short_answer": "the eiffel tower is located in paris, france.", - "detailed_answer": "the eiffel tower is a wrought-iron structure located in paris, france. it is situated on the banks of the seine river in the 7th arrondissement of paris. the tower was designed and built by gustave eiffel and his team between 1887 and 1889 for the 1889 exposition universelle, a world's fair celebrating the 100th anniversary of the french revolution. the tower stands at an official height of 984 feet (300 meters), including its antennas, and is made of 18,038 pieces of metal connected with 2.5 million rivets. it is considered a technological masterpiece in building-construction history and is one of the most recognizable structures in the world, drawing millions of visitors each year.", - "sources": [ - "https://artenquire.com/2024/08/28/who-built-the-eiffel-tower-and-why/", - "https://duitdesign.com/who-designed-the-eiffel-tower-and-when-was-it-built.html", - "https://www.solosophie.com/eiffel-tower-wasnt-designed-by-gustave-eiffel/", - "https://heightandsize.com/eiffel-tower-height/", - "https://travelpander.com/eiffel-tower-history-and-facts/", - "https://www.travelandleisure.com/attractions/landmarks-monuments/eiffel-tower-facts", - "https://www.britannica.com/topic/Eiffel-Tower-Paris-France", - "https://travelpander.com/does-the-eiffel-tower-have-a-purpose/", - "https://travelpander.com/eiffel-tower-why-is-it-important/", - "https://parade.com/travel/eiffel-tower-facts", - "https://www.architecturaldigest.com/story/eiffel-tower-everything-you-need-to-know" - ] + "detailed_answer": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nshort answer: the eiffel tower is located in paris, france.\ndetailed answer: the eiffel tower is a wrought-iron structure in paris, france that is one of the most famous landmarks in the world. it is also a technological masterpiece in building-construction history. it was designed and built (1887-89) by gustave eiffel and named in his honor. the tower drew almost two million visitors during the world's fair, proving its immediate popularity despite earlier opposition. while initially intended as a temporary structure, to be dismantled after 20 years, eiffel had the foresight to propose scientific uses for the tower, which ensured its survival. today, it remains one of the tallest and most recognizable structures in the world, drawing millions of visitors annually, with viewing platforms offering stunning panoramic views of paris.", + "sources": { + "https://learnsql.com/blog/sql-subquery-practice/": [ + "SQL Subquery Practice: 15 Exercises with Solutions" + ], + "https://www.dataquest.io/blog/sql-subqueries-for-beginners/": [ + "SQL Subqueries: A Beginner's Guide (with Code Examples) - Dataquest" + ], + "https://w3schools.tech/tutorial/sql/sql-sub-queries": [ + "Sub Queries - Advanced SQL - W3schools" + ], + "https://www.geeksforgeeks.org/sql/sql-subquery/": [ + "SQL | Subquery - GeeksforGeeks" + ], + "https://www.britannica.com/topic/Eiffel-Tower-Paris-France": [ + "Eiffel Tower | History, Height, & Facts | Britannica" + ], + "https://heightandsize.com/eiffel-tower-height/": [ + "How Tall is the Eiffel Tower? - heightandsize.com" + ], + "https://www.architecturaldigest.com/story/eiffel-tower-everything-you-need-to-know": [ + "Eiffel Tower: Everything You Need to Know - Architectural Digest" + ], + "https://gosta.media/en/society/tourism/what-height-eiffel-tower-15-interesting-facts-you-may-not-know/": [ + "15 Interesting Facts About The Eiffel Tower That You May Not Know" + ], + "https://travelpander.com/does-the-eiffel-tower-have-a-purpose/": [ + "Eiffel Tower: Purpose, History, Facts, and Practical Uses Explained" + ], + "https://www.travelandleisure.com/attractions/landmarks-monuments/eiffel-tower-facts": [ + "16 Eiffel Tower Facts: History, Science, and Secrets - Travel" + ], + "https://1worldplace.com/what-is-the-purpose-of-the-eiffel-tower/": [ + "What is the Purpose of the Eiffel Tower: Discover remarkable" + ], + "https://www.w3resource.com/sql-exercises/sql-subqueries-exercises.php": [ + "SQL SUBQUERIES - Exercises, Practice, Solution - w3resource" + ], + "https://www.dannydutch.com/post/the-eiffel-tower-from-controversy-to-icon-the-opening-that-changed-paris-forever": [ + "The Eiffel Tower: From Controversy to Icon - The Opening That Changed ..." + ], + "https://worldofhistorycheatsheet.com/history-of-the-eiffel-tower/": [ + "History of the Eiffel Tower | World of History" + ], + "https://worldhistoryjournal.com/2025/01/11/the-eiffel-tower-engineering-marvel-and-cultural-landmark/": [ + "The History of the Eiffel Tower: an Engineering and Cultural Landmark" + ] + } }, { "query": "When will the artificial intelligence conference be held?", - "rag_summary": null, - "web_summary": null, - "short_answer": "the ai conference will be held on september 17-18, 2025, in san francisco, california, us.", - "detailed_answer": "the ai conference is a two-day in-person conference discussing the latest in ai. it will be held on september 17-18, 2025, in san francisco, california, us. the conference will cover topics such as what's working in the best applied-ai startups, and technical lessons into the nuances of neural architectures, foundational models, alignment, and more. the cost of attending the conference ranges from $199.", - "sources": [ - "https://aaai.org/conference/aaai/aaai-25/main-technical-track/", - "https://engine.com/business-travel-guide/biggest-ai-expos-events-conferences-2025", - "https://www.iso.org/news/2025/01/world-first-international-ai-standards-summit-announced-in-davos", - "https://www.analyticsinsight.net/artificial-intelligence/top-ai-conferences-to-watch-out-for-in-2025", - "https://oxfordabstracts.com/blog/top-ai-conferences-to-attend/", - "https://www.forbes.com/sites/dianaspehar/2025/02/10/paris-ai-summit-2025-5-critical-themes-shaping-global-ai-policy/", - "https://vsynergize.com/blog/top-6-global-ai-summit-and-conferences-in-2025/", - "https://aaai.org/conference/aaai/aaai-25/", - "https://www.datacamp.com/blog/top-ai-conferences", - "https://www.analyticsvidhya.com/blog/2025/06/ai-conferences-2025/", - "https://datasciencedojo.com/blog/top-ai-conferences-in-usa/", - "https://www.ip-paris.fr/en/news/ai-action-summit-conference-ai-science-and-society-ip-paris", - "https://onu.delegfrance.org/ai-action-summit-10-and-11-february-2025" - ] + "rag_informations": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUse the following context to answer the questions.\n\nContext:\n[1] Q-bench: A benchmark for general-purpose foundation models on low-level vision. *arXiv preprint arXiv:2309.14181*, 2023.\n- [57] Junbin Xiao, Xindi Shang, Angela Yao, and Tat-Seng Chua. Next-qa: Next phase of question-answering to explaining temporal actions. In *Proceedings of the IEEE/CVF conference on computer vision and pattern recognition*, pages 9777–9786, 2021.\n- [58] Runsen Xu, Xiaolong Wang, Tai Wang, Yilun Chen, Jiangmiao Pang, and Dahua Lin. Pointllm: Empowering large language models to understand point clouds. *arXiv preprint arXiv:2308.16911*, 2023.\n- [59] Zhou Yu, Dejing Xu, Jun Yu, Ting Yu, Zhou Zhao, Yueting Zhuang, and Dacheng Tao. Activitynet-qa: A dataset for understanding complex web videos via question answering. In *Proceedings of the AAAI Conference on Artificial Intelligence*, volume 33, pages 9127–9134, 2019.\n- [60] Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu Jiang, Weiming Ren, Yuxuan Sun, et al. Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. In *Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition*, pages 9556–9567, 2024.\n- [61] Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu\n\n\n\n[2] Applied Data Analysis (CS401)\n\nLecture 1\nIntro to ADA\n11 Sep 2024\nMaria Brbić / Robert West\n‹#›\nImportant websites\n\nhttp://ada.epfl.ch\nYour main entry point. All materials linked from there.\nhttps://go.epfl.ch/ada2024-ed\nMain communication channel. Sign in with your EPFL email address (or simply access via Moodle).\n\nhttps://github.com/epfl-ada/2024\nUsed for exercises, homework, project, and final exam.\n‹#›\nPrevious instructor\nRobert West\nOn sabbatical in Microsoft Research, Seattle\n\n‹#›\nAbout your instructor this year\n~1500 people\n~ 3.850.000 people\n\n\n\nBorn in Tučepi, Croatia\nEducation:\nUniversity of Zagreb, Croatia\nUniversity of Tokyo\nStanford University, USA\nAssistant Professor at EPFL since Sep '22\nMachine Learning for Biomedicine (MLBio) lab\n‹#›\nOur research @ MLBio\nMethods\nMy Research\nBiomedicine\nMLBio\nLab\nMachine\nLearning\nChallenges\n‹#›\nOur research @ MLBio\nDevelop new AI methods\nUnsupervised learning, generative models, open-world learning etc\nCollaborate with biologists and medical researchers and have new \"untouched datasets\" or collect datasets\nGain new insights from these datasets 🡪 what are interesting questions that need new AI algorithms to be answered?\nApply AI algorithms we develop to advance biomedical research and drive new discoveries in biology and medicine\n‹#›\nOur research @ MLBio\n‹#›\nData analysis\n\n\"... the process of inspecting, cleaning, transforming, and modeling data with the goal of discovering useful information, suggesting conclusions, and supporting decision-making.\"\n\n\n[3] Applied Data Analysis (CS401)\n\nLecture 1\nIntro to ADA\n11 Sep 2024\nMaria Brbić / Robert West\n‹#›\nImportant websites\n\nhttp://ada.epfl.ch\nYour main entry point. All materials linked from there.\nhttps://go.epfl.ch/ada2024-ed\nMain communication channel. Sign in with your EPFL email address (or simply access via Moodle).\n\nhttps://github.com/epfl-ada/2024\nUsed for exercises, homework, project, and final exam.\n‹#›\nPrevious instructor\nRobert West\nOn sabbatical in Microsoft Research, Seattle\n\n‹#›\nAbout your instructor this year\n~1500 people\n~ 3.850.000 people\n\n\n\nBorn in Tučepi, Croatia\nEducation:\nUniversity of Zagreb, Croatia\nUniversity of Tokyo\nStanford University, USA\nAssistant Professor at EPFL since Sep '22\nMachine Learning for Biomedicine (MLBio) lab\n‹#›\nOur research @ MLBio\nMethods\nMy Research\nBiomedicine\nMLBio\nLab\nMachine\nLearning\nChallenges\n‹#›\nOur research @ MLBio\nDevelop new AI methods\nUnsupervised learning, generative models, open-world learning etc\nCollaborate with biologists and medical researchers and have new \"untouched datasets\" or collect datasets\nGain new insights from these datasets 🡪 what are interesting questions that need new AI algorithms to be answered?\nApply AI algorithms we develop to advance biomedical research and drive new discoveries in biology and medicine\n‹#›\nOur research @ MLBio\n‹#›\nData analysis\n\n\"... the process of inspecting, cleaning, transforming, and modeling data with the goal of discovering useful information, suggesting conclusions, and supporting decision-making.\"\n\"Data analysis has multiple facets and approaches, encompassing diverse techniques under a variety of names, in different business, science, and social science domains.\"\n‹#›\nApplied data analysis\nThis course is about breadth, not depth\n\"What methods, principles, and tools are out there?\", rather than \"How can I become an expert in deep learning for computer vision applied to images of cats?\"\nData science is a fast-paced, shifting field\nObsessing on one tool or technique won't pay off in a few years\nBe ready to explore and keep learning on your own\nComplementary courses:\nMachine learning\nNLP\nDIS\nData viz\nGoal of this class: Enable you to conduct a\nfull-fledged data science project from start to finish\nThat being said, depth matters too…\n‹#›\nLet's abbreviate this course as Ada, not A-D-A, in honor of Ada Lovelace, \"the world's first computer programmer.\"\nhttps://en.wikipedia.org/wiki/Ada_Lovelace\n\n‹#›\n\n‹#›\nSyllabus\nHandling data\n\"Slicing and dicing\": obtaining, preparing, juggling data\nVisualizing data\nExploration of data, communication of results\nDescribing data\nHow to support (and be suspicious of) claims about data\nRegression analysis for disentangling data\nHow to disentangle datasets with correlated variables\nCausal analysis of observational data\nHow to deal with \"found data\"\nCorrelation != causation\n‹#›\nSyllabus (cont'd)\nLearning from data\nSupervised learning\nUnsupervised learning\nApplied aspects of machine learning\nHandling specific types of data\nHandling text data\nHandling network data\nScaling to massive data\n‹#›\nGrading\n35% Homework assignments (2)\nInvolving skills required from data scientists\nGroups of 5 students\nHomework of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n35% Final exam\nMini data analysis project\nDone on laptop, individually, on campus\nFinal exams of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n30% Project (more details soon)\nYour own freestyle data analysis\nDone in groups of 5 students (same as for homework)\nMilestones spread throughout the semester\nProjects of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n‹#›\nGrading\n35% Homework assignments (2)\nInvolving skills required from data scientists\nGroups of 5 students\nHomework of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n35% Final exam (date TBD)\nMini data analysis project\nDone on laptop, individually, on campus\nFinal exams of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n30% Project (more details soon)\nYour own freestyle data analysis\nDone in groups of 5 students (same as for homework)\nMilestones spread throughout the semester\nProjects of 2017, 2018, 2019, 2020, 2021, 2022, 2023\nThis class will be hard work,\nbut it will get you a job.\n‹#›\nGrading (cont'd)\nTo obtain a meaningful grade distribution, scaling/shifting will be applied to each of {homework, project, exam, quizzes} before taking weighted average (standard practice at EPFL)\nWhile intermediate grades are a good indication of where you stand, remember there might be some wiggle\n→ Don't rely on intermediate grades to decide whether you can afford to skip the exam etc.\n‹#›\nDeadlines\nHomeworks\nHomework 1\nRelease Oct 4th 2024\nDue Oct 18th 2024\nHomework 2\nRelease Nov 15th 2024\nDue Nov 29th 2024\nFinal exam\nDate TBD\nProject deliverables\nProject milestone P1\nDue Oct 4th 2024\nProject milestone P2\nDue Nov 15th 2024\nProject milestone P3\nDue Dec 20th 2024\nAll deadlines are 23:59 CET\n‹#›\nMeeting logistics: Lectures\nWednesdays 8:15–10:00\nIf you want to see it live, come to class! (No live streaming)\nLectures are also recorded and made available after class\n‹#›\nMeeting logistics: Lab sessions\nFridays 13:15–14:45\nIn person only:\nCO 1\nCM 1 120\nCM 1 121\nLabs are complementary to lectures, not simply more detail on same\nYou solve exercises that we make available the day before, can ask questions and get help from assistants\nIn certain weeks: homework/project office hours (probably on Zoom, in parallel to exercises)\n‹#›\nWeekly quizzes\nAvailable online on Moodle after every lecture\n5 questions, to be answered within 10 minutes of starting\nQuiz 1: no real questions, just to let you get familiar with the setup\nQuiz 2: the first quiz with lecture material questions\nQuiz i is about lecture material of week i\nGoal:\nEngage continuously with course material\nThink (not just find right slide)\nNot graded, for you to recap lecture materials\n‹#›\nProject\nWe'll provide a number of datasets\nYou need to form and pitch a crisp project idea\nFree to combine with other datasets (at your own risk)\nGoal: not a loose collection of results – tell a story with the data!\nData stories of 2017, 2018, 2019, 2020, 2021, 2022, 2023\nNice example data story\n‹#›\n\nHomework and projects: GitHub\nDe-facto standard for managing and sharing code\nAll students in this class need a GitHub account\nHomework and project submissions done via GitHub\nADA Github repository:\nhttps://github.com/epfl-ada/2024\n‹#›\nMain communication channel:\nClass forum, available via Moodle\nAlso accessible directly, outside of Moodle:https://go.epfl.ch/ada2024-ed (sign in using the same email address as for Moodle)\nCentral place to ask all class-related questions\nDon't send us email\nMandatory! We'll send important announcements on Ed only\nHelp each other (without cheating, of course)\n‹#›\nWatch-at-home videos\nThroughout the semester, we'll release videos with supplemental information; e.g.,\nIntro to lab sessions (already available!)\nProject instructions\nHomework 1 postmortem\nHomework 2 postmortem\n‹#›\nGeneral note on communication\nMultiple platforms used in ADA for various tasks (as in real life): Ed, GitHub, Google docs, ADA website\nTo avoid confusion,\nfamiliarize yourself with communication guidelines\nall materials will be linked from the website as a central point of entry: https://ada.epfl.ch\nall discussions will take place on Ed\n‹#›\nCommercial break\nADA students: sharp like teeth!\n‹#›\nGroup registration\nMust form teams within 2 weeks, starting now (in time for release of Homework 1)!\nGet started immediately to find 4 other teammates\nBy Fri 27 Sep 23:59, complete the registration form (to be done by each team member individually):https://go.epfl.ch/ada2024-team-registration\nCan change team after Homework 1 (but try to avoid it)\n‹#›\nPrerequisites\nBasics of\nprobabilities and stats\ndatabases\nprogramming\nYou won't survive if you can't program\nHomework, exam: Python required\nProject: up to you, but we support only Python\nBrush up your Python skills (many great online courses out there)\n‹#›\nPython environments\nHomeworks and exams to be done as Jupyter Notebooks\nYou will submit a pre-executed .ipynb file\nWe don't care how you produce it\nOption 1: local Python installation (e.g., Anaconda + JupyterLab)\nOption 2: Google Colab = notebook hosted by Google\nOption 3: noto = notebook hosted by EPFL\nTo get started: come to Friday's lab session (\"Exercise 0\")\n\"Homework 0\": do it yourself at home after lab session (optional, not graded)\nDoing Homework 0 is the best way of making sure you're set up correctly for later homework, project, exam\n‹#›\nPython++\n‹#›\nPOLLING TIME\n\"What is your prior experience with Python?\"\nScan QR code or go tohttps://app.sli.do/event/iPMVwwrojFn9EaNp6iiDDx\n\n\n‹#›\nInstructor\nTAs: Teaching assistants (PhD students)\nHead TAs\n\n\n\n\nTimDavidson\nSepideh\nMamooler\nAoxiangFan\nMaria Brbić\nMarija Šakota\nSaibo Geng\n\n\n\n\nIvan\nZakazov\nBettina\nMessmer\nShuqi\nWang\nShawn Fan\nShuangqi\nLi\nYist\nYu\nYulun\nJiang\nArtyomGadetsky\nSAs: Student assistants (Master students)\n\n\n\n\n\n\n\n\n\n\n\nStefan\nKrsteski\nXi\nLei\nMatea\nTashkovska\nElisa\nBillard\nPablo\nMenéndez\nJiaming Jiang\nJakhongir Saydaliev\nAllocio\nJeanne\nYannis Laaroussi\nSebastien\nChahoud\nYagiz\nGençer\nTianhao\nDai\nTugba\nTümer\n\nHelp each other on Ed\nParticipate actively in classes and labs\nGive us feedback\n‹#›\nFeedback\nGive us feedback on this lecture here: https://go.epfl.ch/ada2024-lec1-feedback\nFeedback form available for each lecture and lab session\nWhat did you (not) like about this lecture?\nWhat was (not) well explained?\nOn what would you like more (fewer) details?\n…\n‹#›\nQuestions?\n\n‹#›\nWhat is data science?\n‹#›\nWhy now?\nI WISH I HAD BECOME A DATA SCIENTIST BACK IN 2023…\n‹#›\n\"Data science\"\nMost science is (or should be) based on data, per definitionem\nSo how is \"data science\" different from plain old \"science\"?\n‹#›\nData volume explodes\n\"Between the dawn of civilization and 2003, we only created five exabytes of information; now [in 2010] we're creating that amount every two days.\"\nEric Schmidt, Google (2010)\n‹#›\nData variety explodes\nText (indexed Web pages, email), networks (Web graph, knowledge graph), images, maps, logs (search logs, server logs, GPS logs), speech, …\n\n‹#›\nNeeded: A method to the madness\n\nScientific method 1.0:- Focused on \"Model the data\"- Scientist has hypothesis prior to analyzing the data\nScientific method 2.0:\n- Data-driven science - Systematic cycle (see diagram) - \"Explore the data\" becomes increasingly important\nData as a first-class citizen\n‹#›\nScientist 2.0\n‹#›\n\"A data scientist is someone who can obtain, scrub, explore, model, and interpret data, blending hacking, statistics, and machine learning. Data scientists not only are adept at working with data, but appreciate data itself as a first-class product.\"\nHilary Mason, chief scientist at bit.ly\n‹#›\nJosh Wills, Data Scientist at Slack\n‹#›\n‹#›\n‹#›\nMore data often beats better algorithms\n\n\nhttp://www.incompleteideas.net/IncIdeas/BitterLesson.html\n‹#›\n21st-century politics\n\n\n\n\nWe ask: Do these subjective impressions reflect the true state of US political discourse?\n\nADA will teach you the tools to answer such questions using data (see next slides)\nhttps://www.pewresearch.org/politics/wp-content/uploads/sites/4/2019/06/PP_2019.06.19_Political-Discourse_FINAL.pdf\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\nData: https://github.com/epfl-dlab/Quotebank\nWeb interface: https://quotebank.dlab.tools/\n‹#›\nSyllabus, revisited\n\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n‹#›\nSyllabus, revisited\n\"Is the effect real, or could it have been produced by chance?\"\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n\n\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n\"What caused the observed increase in negativity?\"\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\nQuotes were attributed to speakers by a machine learning algorithm\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\nResearch question (\"Did political discourse become more negative?\") is a question about language == text\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n\"Who speaks about whom in what way?\" → Construct \"who-mentions-whom\" network\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\nExtracting 235 million quotes from 127 million news articles (about 1 terabyte of raw text) spanning 12 years requires big-data tools (e.g., Spark)\n‹#›\nCurious to learn more?\nFull paper available at https://www.nature.com/articles/s41598-023-36839-1\n\n‹#›\nTODO before Friday's lab session\nSign up for Ed here and familiarize yourself with it\nIf you're not on GitHub yet, sign up for GitHub\nStart looking for 4 teammates\nYou may use \"Group formation\" category on Ed\nCheck out Google Colab and noto (to see if you want to use either of them)\nCheck out Exercise 0 here (in prep for Fri lab session)\n‹#›\nAny feedback? -- Let us know!\nGive us feedback on this lecture here: https://go.epfl.ch/ada2024-lec1-feedback\nFeedback form available for each lecture and lab session\nWhat did you (not) like about this lecture?\nWhat was (not) well explained?\nOn what would you like more details?\nWhat would you like the instructor to wear next time?\n…\n‹#›<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhen will the artificial intelligence conference be held?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI don't have access to specific event schedules. However, you can check the conference website or contact the organizers for more information about the conference schedule.", + "rag_summary": "i don't have access to specific event schedules. however, you can check the conference website or contact the organizers for more information about the conference schedule.\n\nif the context contains the answer or any useful information, respond with that information. \nif no useful informations are, answer: no useful informations\nanswer: \n---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nno useful information", + "web_summary": "no useful information is provided in the context to determine the date of the artificial intelligence conference.", + "short_answer": "no useful information", + "detailed_answer": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nshort answer: no useful information\ndetailed answer: no useful information is provided to determine the date of the artificial intelligence conference.", + "sources": { + "https://www.w3resource.com/sql-exercises/sql-subqueries-exercises.php": [ + "SQL SUBQUERIES - Exercises, Practice, Solution - w3resource" + ], + "https://www.geeksforgeeks.org/sql/how-to-use-a-subquery-in-a-select-statement/": [ + "How to Use a Subquery in a SELECT Statement - GeeksforGeeks" + ], + "https://www.datacamp.com/tutorial/sql-subquery": [ + "SQL Subquery: A Comprehensive Guide - DataCamp" + ], + "https://www.kdnuggets.com/beginners-guide-subqueries-sql": [ + "Beginner's Guide to Subqueries in SQL - KDnuggets" + ], + "https://aaai.org/conference/aaai/aaai-25/": [ + "The 39th Annual AAAI Conference on Artificial Intelligence" + ], + "https://engine.com/business-travel-guide/biggest-ai-expos-events-conferences-2025": [ + "The Biggest AI Conferences & Events Happening in 2025" + ], + "https://www.techtarget.com/whatis/feature/Top-AI-conferences-and-virtual-events": [ + "12 AI conferences to attend in 2025 - TechTarget" + ], + "https://www.analyticsinsight.net/artificial-intelligence/top-ai-conferences-to-watch-out-for-in-2025": [ + "Top AI Conferences to Watch Out for in 2025 - Analytics Insight" + ], + "https://datascienceexplain.com/sql-databases/mastering-sql-subqueries-a-complete-guide-to-writing-efficient-nested-queries/": [ + "Mastering SQL Subqueries: A Complete Guide to Writing Efficient Nested ..." + ], + "https://aaai.org/conference/aaai/aaai-25/program-overview/": [ + "The 39th Annual AAAI Conference on Artificial Intelligence" + ], + "https://www.datacamp.com/blog/top-ai-conferences": [ + "Top 10 AI Conferences for 2025 - DataCamp" + ], + "https://www.nimblework.com/blog/ai-conferences-and-events-2025/": [ + "Top AI Conferences And Events Of 2025 - nimblework.com" + ] + } } ] \ No newline at end of file diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index ac387e82..8b6bce8b 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -23,10 +23,9 @@ from .run_rag import create_api as create_api_rag -from .websearchRAG.logging_config import logger # Import the shared logger +from .websearchRAG.logging_config import logger -#à quoi ça sert? load_dotenv() @@ -72,9 +71,10 @@ def run_websearch(config_file): class QueryInput(BaseModel): input: str = Field(..., description="The user query") collection_name: Optional[str] = Field( - None, description="The collection to search (optional)" + None, description="The collection to search if use_rag set to True" ) + class WebQuery(BaseModel): query: QueryInput = Field( ..., @@ -150,15 +150,3 @@ def websearch(query: WebQuery): ) args = parser.parse_args() run_websearch(args.config_file) - - - - -# { -# "query": { -# "input": "When was Barack Obama born?", -# "collection_name": "my_docs" -# }, -# "use_rag": true, -# "use_summary": true -# } \ No newline at end of file diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index 05461739..3982699e 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -1,7 +1,7 @@ # mmore/websearch/config.py from dataclasses import dataclass, field -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Literal from pathlib import Path import yaml @@ -36,8 +36,8 @@ class WebsearchConfig: n_subqueries: int = 3 n_loops: int = 2 max_searches: int = 10 - llm_config: Dict[str, Any] = field(default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 100}) - mode: str = "local" + llm_config: Dict[str, Any] = field(default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 1200}) + mode: Literal["local", "api"] = "local" def __post_init__(self): required_fields = ["rag_config_path", "llm_config", "mode"] diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index b55fb20c..d8e169d2 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -8,14 +8,13 @@ import time import tempfile import os +from dataclasses import dataclass,asdict from langchain_community.tools import DuckDuckGoSearchResults from duckduckgo_search.exceptions import RatelimitException from langchain_community.utilities import DuckDuckGoSearchAPIWrapper -from ddg import Duckduckgo - from langchain_core.messages import HumanMessage, SystemMessage from ..run_rag import rag @@ -25,7 +24,15 @@ -# python -m mmore websearch --config-file examples/websearchRAG/config.yaml +@dataclass +class ProcessedResponse: + query: str + rag_informations: str + rag_summary: str + web_summary: str + short_answer: str + detailed_answer: str + sources: Dict[str, Any] # Maps URLs to lists of titles @@ -39,7 +46,7 @@ def __init__(self, config: WebsearchConfig): self.config = config self.llm = self._initialize_llm() self.rag_results = None - self.wrapper = DuckDuckGoSearchAPIWrapper(max_results=self.config.max_searches, backend='auto') + self.wrapper = DuckDuckGoSearchAPIWrapper(max_results=self.config.max_searches) self.search = DuckDuckGoSearchResults(api_wrapper=self.wrapper, output_format="list") @@ -52,13 +59,11 @@ def _initialize_llm(self) -> LLM: return LLM.from_config(LLMConfig(**llm_conf)) else : base_conf = self.config.get_llm_config() - if isinstance(base_conf, LLMConfig): # Ensure it's a dictionary - base_conf = base_conf.__dict__ + base_conf = base_conf.__dict__ return LLM.from_config(LLMConfig(**base_conf)) - - def generate_summary(self, rag_answer: str, query: str) -> str: + def generate_summary(self, rag_answer, query: str): """ Summarize the RAG answer (used when rag_summary=True) """ @@ -68,30 +73,44 @@ def generate_summary(self, rag_answer: str, query: str) -> str: "Context:\n" f"{rag_answer}\n\n" "If the context contains the answer or any useful information, respond with that information. \n" - "If no useful informations are, answer: no useful informations" + "If no useful informations are, answer: no useful informations\n" "Answer: \n" "---------------------------" ) - if not self.config.use_rag: - return None - messages = [ SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), HumanMessage(content=prompt), ] response = self.llm.invoke(messages) - # print("##SUMMARY CLEAN##") - # print(self._clean_llm_output(response.content)) return self._clean_llm_output(response.content) + def evaluate_subquery_relevance(self, query, current_subqueries, previous_subqueries): + prompt = ( + f"Original query:\n{query}\n\n" + f"Previous subqueries that contribute to understanding:\n{previous_subqueries}\n\n" + f"New subqueries:\n{current_subqueries}\n\n" + "Are any of the new subqueries relevant in the context of the original query and previous subqueries? " + "Respond strictly with 'yes' if at least one is relevant, or 'no' if none are." + ) + messages = [ + SystemMessage(content="You are a helpful assistant"), + HumanMessage(content=prompt), + ] + response = self.llm.invoke(messages) + response = self._clean_llm_output(response.content) + + if 'no' in response : + return False + else : + return True def _clean_llm_output(self, content: str): delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" if delimiter not in content: - return [] if extract_subqueries else "" + return content # Extract the section after the delimiter cleaned_section = content.split(delimiter, 1)[-1].lower().strip() @@ -99,8 +118,6 @@ def _clean_llm_output(self, content: str): return cleaned_section - - def generate_subqueries( self, original_query: str, @@ -134,9 +151,6 @@ def generate_subqueries( response = self.llm.invoke(messages) cleaned_answer = self._clean_llm_output(response.content) cleaned_answer = re.findall(r"subquery \d+: (.*)", cleaned_answer) - # print("######") - # print("Clean response: ", cleaned_answer) - # print("--------------------") return cleaned_answer @@ -148,7 +162,6 @@ def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: Returns a list of dicts with keys: 'title' and 'url' """ - time.sleep(2) # delay to try to avoid error 202 ### TO BE IMPROVED ### try: results = self.search.invoke(query) @@ -157,22 +170,15 @@ def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: snippet = r.get("snippet", "") url = r.get("link", "") # note: it's "link" in LangChain results title = r.get("title", "") - if url: - formatted_results.append({"snippet": snippet, "url": url, "title" : title}) - print("Websearch", formatted_results) + + formatted_results.append({"snippet": snippet, "url": url, "title" : title}) + return formatted_results except Exception as e: logger.error(f"DuckDuckGo search error: {e}") return [] - def search_alternative(self, query): - dg_api = Duckduckgo() - results = dg_api.search(query) - print("query", query) - print("Ohh yeah yeah", results) - - def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[str]) -> Dict[str, str]: # Build prompt for short & detailed answer sources = "\n".join(web_snippets) @@ -190,7 +196,7 @@ def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[st resp = self.llm.invoke(msgs) # parse clean_content = self._clean_llm_output(resp.content) - + sa_matches = re.findall( r"short answer:\s*(.*?)(?=detailed answer:)", clean_content, @@ -201,19 +207,20 @@ def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[st clean_content, flags=re.IGNORECASE|re.DOTALL ) + short = sa_matches[-1].strip().rstrip(",") if sa_matches else "" detailed = da_matches[-1].strip() if da_matches else "" return {"short": short, "detailed": detailed} - def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: qr = rec.get("input", "").strip() rag_ans = rec.get("answer", "") if self.config.use_rag else "" self.rag_results = rag_ans rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None + all_sources: Set[str] = set() source_map = {} current_context = rag_summary @@ -221,6 +228,8 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: web_summary = "" web_summaries = [] + previous_sub = [] + for loop in range(self.config.n_loops): if self.config.use_rag: @@ -231,42 +240,40 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: snippets, urls = [], [] subquery_summaries = [] + + if loop > 0 and not self.evaluate_subquery_relevance(qr, subs, previous_sub): + break + for sq in subs: - #print("subquery:", sq) + time.sleep(10) res = self.duckduckgo_search(query = sq) - self.search_alternative(sq) subquery_snippets = [] - # for r in res: - # if r["url"] not in all_sources: - # all_sources.add(r["url"]) - for r in res: if r["url"] not in source_map: - source_map[r["url"]] = [] # Initialize as a list for titles - source_map[r["url"]].append( r["title"]) # Add title to the list - + source_map[r["url"]] = [] + + if r["title"] not in source_map[r["url"]]: + source_map[r["url"]].append(r["title"]) + snippet = f"{r['snippet']})" snippets.append(snippet) - #print("Current sub-snippet", snippet) subquery_snippets.append(snippet) - # Summarize each subquery's snippets independently if rag_summary is True - if self.config.use_summary: - if subquery_snippets: - combined_snippets = "\n".join(subquery_snippets) - summary = self.generate_summary(combined_snippets, sq) - subquery_summaries.append(summary) - else: - subquery_summaries.append("") - - if self.config.use_summary: - combined_sub_summaries = "\n".join([str(s) if s else "" for s in subquery_summaries]) - web_summary = self.generate_summary(combined_sub_summaries, qr) - web_summaries.append(web_summary) - #print("Current websummary: ", web_summary) + combined_snippets = "\n".join(subquery_snippets) + + summary = self.generate_summary(combined_snippets, sq) + subquery_summaries.append(summary) + + previous_sub = subs + + combined_sub_summaries = "\n".join([str(s) if s else "" for s in subquery_summaries]) + web_summary = self.generate_summary(combined_sub_summaries, qr) + web_summaries.append(web_summary) + + if self.config.use_summary: # Combine rag summary, web summary, and original query for final answer context_for_llm = f"RAG informations:\n{rag_summary or ''}\n\nWeb informations:\n{web_summary}" else: @@ -276,6 +283,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: combined_web_summaries = "\n".join([str(s) if s else "" for s in web_summaries]) web_summary_all = self.generate_summary(combined_web_summaries, qr) + print(web_summary_all) # Current context, web content to generate the answer out = self.integrate_with_llm(qr, context_for_llm, snippets) final_short, final_detailed = out["short"], out["detailed"] @@ -283,20 +291,17 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: # Prepare context for next search loop current_context = final_detailed + solution = ProcessedResponse( + query=qr, + rag_informations=self.rag_results, + rag_summary=rag_summary if self.config.use_rag else None, + web_summary=web_summary_all, + short_answer=final_short, + detailed_answer=final_detailed, + sources=source_map + ) - - return { - "query": qr, - "rag informations" : self.rag_results, - "rag_summary": rag_summary if self.config.use_rag else None, - "web_summary": web_summary_all if self.config.use_summary else None, - "short_answer": final_short, - "detailed_answer": final_detailed, - #"sources": list(all_sources), - "sources" : source_map, - } - - + return asdict(solution) @@ -320,14 +325,15 @@ def run(self): outputs = [] - for rec in data: - outputs.append(self.process_record(rec)) + outputs = [self.process_record(rec) for rec in data] + + # save outp = Path(self.config.output_file) outp.parent.mkdir(exist_ok=True, parents=True) with open(outp, 'w', encoding='utf-8') as f: json.dump(outputs, f, ensure_ascii=False, indent=2) - print(f"Results saved to {outp}") + logger.info(f"Results saved to {outp}") @@ -366,8 +372,8 @@ def run_api(self, use_rag, use_summary, query): return outputs finally: - # Clean up the temporary file - print(f"Deleting temporary file: {temp_file_path}") + # Delete the temporary file + logger.info(f"Deleting temporary file: {temp_file_path}") os.remove(temp_file_path) @@ -380,5 +386,5 @@ def _save_query_as_json(self, query): temp_file.writelines(json.dumps(q.dict() if hasattr(q, "dict") else q) + '\n' for q in query) else: temp_file.write(json.dumps(query.dict() if hasattr(query, "dict") else query) + '\n') - print(f"Query saved to temporary file: {temp_file.name}") + logger.info(f"Query saved to temporary file: {temp_file.name}") return temp_file.name \ No newline at end of file diff --git a/src/mmore/websearchRAG/websearch.py b/src/mmore/websearchRAG/websearch.py index 6c6667ce..7edda06b 100644 --- a/src/mmore/websearchRAG/websearch.py +++ b/src/mmore/websearchRAG/websearch.py @@ -14,55 +14,13 @@ def websearch_pipeline(self, query: str) -> Dict[str, str]: web_output = search.run(query) return web_output - - - def multiple_queries(self, original_query: str, list_of_queries: List[str], keep_intact: bool) -> List[Dict[str, str]]: - """Perform multiple web searches in parallel for a list of queries.""" - results = [] - with ThreadPoolExecutor() as executor: - future_to_query = {executor.submit(self.websearch_pipeline, query): query for query in list_of_queries} - for future in as_completed(future_to_query): - query = future_to_query[future] - try: - result = future.result() - if keep_intact: - to_keep = self.check_source_of_web_search(original_query, result) - if to_keep: - results.append({'query': query, 'result': result, 'summary' : None}) - else: - summary = self.resume_web_search(original_query, result) - results.append({'query': query, 'result': result, 'summary': summary}) - except Exception as e: - results.append({'query': query, 'error': str(e)}) - - return results - - - - def check_source_of_web_search(self, query: str, web_output: str) -> bool: - """Call LLM to determine if the current web output is useful based on the original query.""" - llm = LLM() # TODO: Implement LLM - prompt = ( - f"Original Query: '{query}'\n" - f"Web Output: '{web_output}'\n" - "Is the web output useful for the original query? Answer with 'True' or 'False'." - ) - response = llm.invoke(prompt) - return response.strip().lower() == 'true' - def resume_web_search(self, query: str, web_output: str) -> str: """Call LLM to resume the current web output based on the original query, return a summary of the web search and the source.""" - llm = LLM() # TODO: Implement LLM + llm = LLM() prompt = ( f"Original Query: '{query}'\n" f"Web content: '{web_output}'\n" "Based on the original query and the web content, can you provide a response to the original query?" ) response = llm.invoke(prompt) - return response.strip() ### RAJOUTER LA SOURCE - -# Example usage: -# websearch = WebsearchOnly() -# results = websearch.multiple_queries("original_query", ["query1", "query2", "query3"], keep_intact=True) -# for res in results: -# print(res) + return response.strip() From 3cde125e30e89742477658871522d392d62e9072 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Mon, 30 Jun 2025 02:32:05 -0700 Subject: [PATCH 14/33] websearch doc --- docs/websearch.md | 105 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 docs/websearch.md diff --git a/docs/websearch.md b/docs/websearch.md new file mode 100644 index 00000000..454fea46 --- /dev/null +++ b/docs/websearch.md @@ -0,0 +1,105 @@ +# WebSearch Integration in RAG Pipeline + + + + +## Implementation + +### Overview +The WebSearch integration uses the DuckDuckGo search API through a wrapper provided by the LangChain library. The implementation combines: + +- **DuckDuckGo Search:** Retrieves external information through concise subqueries generated by a LLm +- **LLM Integration:** Summarizes and integrates retrieved web snippets with RAG results to provide a final answer + +### Workflow + +0. **RAG pipeline:** + - Launch the RAG pipeline if `use_rag` is enabled and the set the output as being the current knowledge +1. **Input Query Processing:** + - The pipeline processes the user query and generates subqueries for web searches in order to complete the current knowledge. +2. **WebSearch Execution:** + - DuckDuckGo searches are performed for each subquery +3. **Summarization:** + - Retrieved web snippets are summarized using an LLM if `use_summary` is enabled. +4. **Integration with RAG (if use_rag):** + - WebSearch results are combined with RAG outputs to generate the current knowledge +5. **Start again:** + - We loop again from step 1 with the updated current knowledge + + +## Customization + +Based on the implementation of the `RAG` module, the `Websearch` module enables the creation of a modular Websearch inference pipeline for your indexed multimodal documents, using two inference modes: + 1. **API**: Creates a server hosting the pipeline + 2. **Local**: Runs the inference locally + +You can customize various parts of the pipeline by defining [an inference RAG configuration file](/examples/rag/api/rag_api.yaml). + + +Users can adjust the pipeline according to their [requirements](/examples/websearch/config.yaml) through the following parameters: + +- `use_rag`: Enables or disables RAG retrieval. +- `use_summary`: Activates summarization of retrieved web snippets. +- `n_loops`: Defines the number of search iterations to refine results. +- `n_subqueries`: Specifies the number of subqueries generated for each input query. + + + + +## Minimal Example + +Here is a example to create a Websearch pipeline hosted through [LangServe](https://python.langchain.com/docs/langserve/) servers. + +1. Create your RAG Inference config file based on the [local example](/examples/websearch/config.yaml) or the [API example](/examples/websearch/config_api.yaml). + +2. Start your Websearch pipeline using the `run_websearch.py` script and your config file + ```bash + python3 -m mmore websearch --config_file /path/to/config.yaml + ``` + +3. In API mode, query the server like any other LangServe server: + ```bash + curl --location --request POST http://localhost:8000/websearch/ \ + -H 'Content-Type: application/json' \ + -d ' { + "query": { + "input": "When was Barack Obama born?", + "collection_name": "my_docs" + }, + "use_rag": true, + "use_summary": true + } + }' + ``` + + In local mode, the pipeline is run directly with the input data specified in the configuration file and the result is saved at the specified path. + + + +## Results and Outputs + +### Output Format +The pipeline provides outputs in the following structure: + +- **Short Answer:** Concise response derived from combined RAG and WebSearch results. +- **Detailed Answer:** Expanded response with context from both sources. +- **Sources:** A list of URLs and their respective titles in the format: + ``` + URL: https://example.com, Title {Title 1; Title 2; ...} + ``` + +### Example Output +```json +{ + "query": "What are the latest advancements in AI?", + "rag_informations": "Pre-existing knowledge from RAG.", + "rag_summary": "Summary of RAG knowledge.", + "web_summary": "Summarized web search information.", + "short_answer": "AI advancements include new models and real-time applications.", + "detailed_answer": "Recent AI developments include breakthroughs in large language models and innovative real-time applications, supported by diverse resources from RAG and web search.", + "sources": [ + "URL: https://example1.com, Title {AI Breakthrough; Latest in AI}", + "URL: https://example2.com, Title {Advancements in AI; AI Trends}" + ] +} + From 1a3175dac754b9f7570f49b61168eea3e6d28e72 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Mon, 30 Jun 2025 02:48:01 -0700 Subject: [PATCH 15/33] delete print --- src/mmore/websearchRAG/pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index d8e169d2..d07f0ef5 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -283,7 +283,6 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: combined_web_summaries = "\n".join([str(s) if s else "" for s in web_summaries]) web_summary_all = self.generate_summary(combined_web_summaries, qr) - print(web_summary_all) # Current context, web content to generate the answer out = self.integrate_with_llm(qr, context_for_llm, snippets) final_short, final_detailed = out["short"], out["detailed"] From 7672f196b5b8533da0e6e5a1a2e997381bd09d21 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Mon, 30 Jun 2025 13:56:29 -0700 Subject: [PATCH 16/33] corrected import --- docs/websearch.md | 6 ++++-- examples/websearchRAG/config_api.yaml | 2 +- src/mmore/cli.py | 17 +++++++++++++++++ src/mmore/run_websearch.py | 17 +++-------------- src/mmore/websearchRAG/logging_config.py | 2 +- src/mmore/websearchRAG/websearch.py | 5 ++++- 6 files changed, 30 insertions(+), 19 deletions(-) diff --git a/docs/websearch.md b/docs/websearch.md index 454fea46..52e6b792 100644 --- a/docs/websearch.md +++ b/docs/websearch.md @@ -33,10 +33,10 @@ Based on the implementation of the `RAG` module, the `Websearch` module enables 1. **API**: Creates a server hosting the pipeline 2. **Local**: Runs the inference locally -You can customize various parts of the pipeline by defining [an inference RAG configuration file](/examples/rag/api/rag_api.yaml). +You can customize various parts of the pipeline by defining [an inference Websearch configuration file](/examples/websearchRAG/config_api.yaml). -Users can adjust the pipeline according to their [requirements](/examples/websearch/config.yaml) through the following parameters: +Users can adjust the pipeline according to their [requirements](/examples/websearchRAG/config.yaml) through the following parameters: - `use_rag`: Enables or disables RAG retrieval. - `use_summary`: Activates summarization of retrieved web snippets. @@ -71,9 +71,11 @@ Here is a example to create a Websearch pipeline hosted through [LangServe](http } }' ``` + In the API mode, it is necessary to provide the `use_rag` and `use_summary`parameters in the query, the number of `n_loops` and `n_subqueries` are defined in the config file. In local mode, the pipeline is run directly with the input data specified in the configuration file and the result is saved at the specified path. + For both mode, if we want to use the RAG pipeline, it is necessay to provide the path to the rag configuration file. ## Results and Outputs diff --git a/examples/websearchRAG/config_api.yaml b/examples/websearchRAG/config_api.yaml index e2564c8f..03cb2469 100644 --- a/examples/websearchRAG/config_api.yaml +++ b/examples/websearchRAG/config_api.yaml @@ -6,7 +6,7 @@ websearch: mode: api llm_config: llm_name: OpenMeditron/meditron3-8b - max_new_tokens: 250 + max_new_tokens: 1200 # Mode Config mode_args: diff --git a/src/mmore/cli.py b/src/mmore/cli.py index ec9a7f04..631451d4 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -1,6 +1,7 @@ from typing import Optional import click +import yaml @click.group() @@ -224,6 +225,22 @@ def index_api(config_file, host, port): run_api(config_file, host, port) +@main.command() +@click.option( + "--config-file", + type=str, + required=True, + help="Path to the Websearch configuration file (YAML).", +) +def websearch(config_file): + """Run the Websearch (+ optional RAG) pipeline.""" + from .run_websearch import run_websearch + + # Load your YAML configuration and pass it into the runner + with open(config_file, "r") as f: + config_dict = yaml.safe_load(f) + + run_websearch(config_dict) @main.command() @click.option( diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index 8b6bce8b..569d5103 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -1,28 +1,22 @@ # mmore/run_websearch.py import argparse -import logging import time import torch -from dataclasses import dataclass, asdict, fields -from typing import Any, Dict, Optional, Union, cast +from dataclasses import dataclass +from typing import Optional, Union import uvicorn from fastapi import FastAPI from pydantic import BaseModel, Field -from langserve import add_routes from dotenv import load_dotenv -from .websearchRAG.logging_config import logger from .utils import load_config from .websearchRAG.config import WebsearchConfig from .websearchRAG.pipeline import WebsearchPipeline -from .rag.pipeline import RAGConfig, RAGPipeline +from .rag.pipeline import RAGPipeline from .run_rag import LocalConfig, APIConfig, RAGInferenceConfig -from .run_rag import create_api as create_api_rag - - from .websearchRAG.logging_config import logger @@ -47,7 +41,6 @@ def __post_init__(self): def run_websearch(config_file): - # 1) Load config cfg = load_config(config_file, WebsearchInferenceConfig) ws = cfg.websearch @@ -67,7 +60,6 @@ def run_websearch(config_file): raise ValueError(f"Unknown mode: {cfg.mode!r}. Must be 'local' or 'api'.") - class QueryInput(BaseModel): input: str = Field(..., description="The user query") collection_name: Optional[str] = Field( @@ -92,9 +84,6 @@ class WebQuery(BaseModel): ) - - - def create_api(config_file: str): app = FastAPI( title="mmore Websearch API", diff --git a/src/mmore/websearchRAG/logging_config.py b/src/mmore/websearchRAG/logging_config.py index ac9f7f0c..7cdd44be 100644 --- a/src/mmore/websearchRAG/logging_config.py +++ b/src/mmore/websearchRAG/logging_config.py @@ -21,4 +21,4 @@ file_handler.setFormatter(formatter) # Add file handler to logger - logger.addHandler(file_handler) \ No newline at end of file + logger.addHandler(file_handler) diff --git a/src/mmore/websearchRAG/websearch.py b/src/mmore/websearchRAG/websearch.py index 7edda06b..d443e999 100644 --- a/src/mmore/websearchRAG/websearch.py +++ b/src/mmore/websearchRAG/websearch.py @@ -1,5 +1,7 @@ -from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List, Dict +from langchain_community.tools import DuckDuckGoSearchResults +from langchain_community.utilities import DuckDuckGoSearchAPIWrapper +from ..rag.llm import LLM class WebsearchOnly: """Class dedicated to performing web searches and validating their usefulness.""" @@ -24,3 +26,4 @@ def resume_web_search(self, query: str, web_output: str) -> str: ) response = llm.invoke(prompt) return response.strip() + From 9e605580aab61b5aa33d197bcfac0855d70a29f1 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Fri, 4 Jul 2025 11:06:48 -0700 Subject: [PATCH 17/33] corrected link --- docs/websearch.md | 41 ++++++++++++++++-------------- src/mmore/websearchRAG/pipeline.py | 8 ++---- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/docs/websearch.md b/docs/websearch.md index 52e6b792..865adb54 100644 --- a/docs/websearch.md +++ b/docs/websearch.md @@ -11,6 +11,24 @@ The WebSearch integration uses the DuckDuckGo search API through a wrapper provi - **DuckDuckGo Search:** Retrieves external information through concise subqueries generated by a LLm - **LLM Integration:** Summarizes and integrates retrieved web snippets with RAG results to provide a final answer +## Customization + +Based on the implementation of the `RAG` module, the `Websearch` module enables the creation of a modular Websearch inference pipeline for your indexed multimodal documents, using two inference modes: + 1. **API**: Creates a server hosting the pipeline + 2. **Local**: Runs the inference locally + +You can customize various parts of the pipeline by defining [an inference Websearch configuration file](/examples/websearchRAG/config_api.yaml). + + +Users can adjust the pipeline according to their [requirements](/examples/websearchRAG/config.yaml) through the following parameters: + +- `use_rag`: Enables or disables RAG retrieval. +- `use_summary`: Activates summarization of retrieved web snippets. +- `n_loops`: Defines the number of search iterations to refine results. +- `n_subqueries`: Specifies the number of subqueries generated for each input query. + + + ### Workflow 0. **RAG pipeline:** @@ -27,21 +45,6 @@ The WebSearch integration uses the DuckDuckGo search API through a wrapper provi - We loop again from step 1 with the updated current knowledge -## Customization - -Based on the implementation of the `RAG` module, the `Websearch` module enables the creation of a modular Websearch inference pipeline for your indexed multimodal documents, using two inference modes: - 1. **API**: Creates a server hosting the pipeline - 2. **Local**: Runs the inference locally - -You can customize various parts of the pipeline by defining [an inference Websearch configuration file](/examples/websearchRAG/config_api.yaml). - - -Users can adjust the pipeline according to their [requirements](/examples/websearchRAG/config.yaml) through the following parameters: - -- `use_rag`: Enables or disables RAG retrieval. -- `use_summary`: Activates summarization of retrieved web snippets. -- `n_loops`: Defines the number of search iterations to refine results. -- `n_subqueries`: Specifies the number of subqueries generated for each input query. @@ -99,9 +102,9 @@ The pipeline provides outputs in the following structure: "web_summary": "Summarized web search information.", "short_answer": "AI advancements include new models and real-time applications.", "detailed_answer": "Recent AI developments include breakthroughs in large language models and innovative real-time applications, supported by diverse resources from RAG and web search.", - "sources": [ - "URL: https://example1.com, Title {AI Breakthrough; Latest in AI}", - "URL: https://example2.com, Title {Advancements in AI; AI Trends}" - ] + "sources": { + "https://example1.com" : ["AI Breakthrough; Latest in AI"], + "https://example2.com" : ["Advancements in AI; AI Trends"], + } } diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index d07f0ef5..2b4cbbbd 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -4,21 +4,18 @@ import re from pathlib import Path from typing import Dict, List, Any, Optional, Set -import logging import time import tempfile import os from dataclasses import dataclass,asdict from langchain_community.tools import DuckDuckGoSearchResults -from duckduckgo_search.exceptions import RatelimitException from langchain_community.utilities import DuckDuckGoSearchAPIWrapper from langchain_core.messages import HumanMessage, SystemMessage from ..run_rag import rag -from ..run_rag import read_queries from ..rag.llm import LLM, LLMConfig from .config import WebsearchConfig @@ -221,7 +218,6 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None - all_sources: Set[str] = set() source_map = {} current_context = rag_summary final_short, final_detailed = "", "" @@ -237,7 +233,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: else: subs = self.generate_subqueries(qr) # Based on original query only - snippets, urls = [], [] + snippets = [] subquery_summaries = [] @@ -386,4 +382,4 @@ def _save_query_as_json(self, query): else: temp_file.write(json.dumps(query.dict() if hasattr(query, "dict") else query) + '\n') logger.info(f"Query saved to temporary file: {temp_file.name}") - return temp_file.name \ No newline at end of file + return temp_file.name From ede3fd09bbc23e999cb1983a67d74d86ceadfc1d Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Sun, 13 Jul 2025 10:44:56 -0700 Subject: [PATCH 18/33] Switch LangServe to LangGraph --- src/mmore/run_rag.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/mmore/run_rag.py b/src/mmore/run_rag.py index 40bbbae9..ca113a47 100644 --- a/src/mmore/run_rag.py +++ b/src/mmore/run_rag.py @@ -8,7 +8,7 @@ import uvicorn from dotenv import load_dotenv from fastapi import FastAPI -from langserve import add_routes +from pydantic import BaseModel from mmore.rag.pipeline import RAGConfig, RAGPipeline from mmore.utils import load_config @@ -62,15 +62,31 @@ def save_results(results: List[Dict], output_file: Union[Path, str]): json.dump(results, f, indent=2) +class InnerInput(BaseModel): + input: str + collection_name: Optional[str] = None + +class RAGInput(BaseModel): + input: InnerInput + +class RAGOutput(BaseModel): + input: Optional[str] = None + context: Optional[str] = None + answer: Optional[str] = None + def create_api(rag: RAGPipeline, endpoint: str): app = FastAPI( title="RAG Pipeline API", description="API for question answering using RAG", - version="1.0", + version="2.0", ) - # Add routes for the RAG chain - add_routes(app, rag.rag_chain, path=endpoint, playground_type="chat") + @app.post(endpoint, response_model=RAGOutput) + async def run_rag(request: RAGInput): + # Extract the inner input dict to pass to rag_chain + pipeline_input = request.input.model_dump() + output_dict = rag.rag_chain.invoke(pipeline_input) + return RAGOutput(**output_dict) @app.get("/health") def health_check(): @@ -80,7 +96,7 @@ def health_check(): def rag(config_file): - """Run RAG.""" + """Run RAG in local or API""" config = load_config(config_file, RAGInferenceConfig) logger.info("Creating the RAG Pipeline...") @@ -93,14 +109,16 @@ def rag(config_file): queries = read_queries(config_args.input_file) results = rag_pp(queries, return_dict=True) save_results(results, config_args.output_file) + elif config.mode == "api": config_args = cast(APIConfig, config.mode_args) app = create_api(rag_pp, config_args.endpoint) uvicorn.run(app, host=config_args.host, port=config_args.port) + else: raise ValueError( - f"Unknown inference mode: {config.mode}. Should be in [api, local]" + f"Unknown mode: {config.mode}. Should be either api or local" ) From c62ef31a6b76e3ed7e5eb3455867db152bb52207 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Mon, 14 Jul 2025 08:17:29 -0700 Subject: [PATCH 19/33] ruff + isort check done --- src/mmore/run_websearch.py | 20 +++++----- src/mmore/websearchRAG/config.py | 18 ++++----- src/mmore/websearchRAG/pipeline.py | 58 +++++++++++++---------------- src/mmore/websearchRAG/websearch.py | 7 +++- 4 files changed, 49 insertions(+), 54 deletions(-) diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index 569d5103..d59fe1f6 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -2,23 +2,21 @@ import argparse import time -import torch from dataclasses import dataclass from typing import Optional, Union +import torch import uvicorn +from dotenv import load_dotenv from fastapi import FastAPI from pydantic import BaseModel, Field -from dotenv import load_dotenv +from .rag.pipeline import RAGPipeline +from .run_rag import APIConfig, LocalConfig, RAGInferenceConfig from .utils import load_config from .websearchRAG.config import WebsearchConfig +from .websearchRAG.logging_config import logger from .websearchRAG.pipeline import WebsearchPipeline -from .rag.pipeline import RAGPipeline -from .run_rag import LocalConfig, APIConfig, RAGInferenceConfig - -from .websearchRAG.logging_config import logger - load_dotenv() @@ -83,7 +81,7 @@ class WebQuery(BaseModel): example=False ) - + def create_api(config_file: str): app = FastAPI( title="mmore Websearch API", @@ -109,13 +107,13 @@ def websearch(query: WebQuery): if query.use_rag: logger.info("Launch RAG") - config_RAG = load_config(config_file.websearch.rag_config_path, RAGInferenceConfig) + config_rag = load_config(config_file.websearch.rag_config_path, RAGInferenceConfig) logger.info("Creating the RAG Pipeline...") - rag_pp = RAGPipeline.from_config(config_RAG.rag) + rag_pp = RAGPipeline.from_config(config_rag.rag) data = rag_pp([query.query.dict()], return_dict=True) logger.info("RAG done") logger.info("##RAG##", data) - else: + else: data = query.query logger.info("Launch websearch") diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index 3982699e..a4e7d047 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -1,8 +1,8 @@ # mmore/websearch/config.py - from dataclasses import dataclass, field -from typing import Any, Dict, Optional, Literal from pathlib import Path +from typing import Any, Dict, Literal, Optional + import yaml from ..rag.llm import LLMConfig # Reuse the same LLMConfig as RAG @@ -37,20 +37,20 @@ class WebsearchConfig: n_loops: int = 2 max_searches: int = 10 llm_config: Dict[str, Any] = field(default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 1200}) - mode: Literal["local", "api"] = "local" + mode: Literal["local", "api"] = "local" def __post_init__(self): - required_fields = ["rag_config_path", "llm_config", "mode"] - for field in required_fields: - if not getattr(self, field): - raise ValueError(f"'{field}' is a required field.") + required_fields = ["n_loops","n_subqueries", "max_searches", "mode"] + for field_name in required_fields: + if not getattr(self, field_name): + raise ValueError(f"'{field_name}' is a required field.") def get_llm_config(self) -> LLMConfig: """ Convert the nested llm_config dict into an instance of rag.llm.LLMConfig. """ return LLMConfig(**self.llm_config) - + def access_rag_config(self) -> Dict[str, Any]: """ Access and parse the RAG configuration file defined in `rag_config_path`. @@ -71,4 +71,4 @@ def access_rag_config(self) -> Dict[str, Any]: with open(rag_config_full_path, "r") as file: rag_config = yaml.safe_load(file) - return rag_config \ No newline at end of file + return rag_config diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index 2b4cbbbd..bc80e90f 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -1,24 +1,20 @@ -from .logging_config import logger - import json +import os import re -from pathlib import Path -from typing import Dict, List, Any, Optional, Set -import time import tempfile -import os -from dataclasses import dataclass,asdict +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional from langchain_community.tools import DuckDuckGoSearchResults from langchain_community.utilities import DuckDuckGoSearchAPIWrapper - - from langchain_core.messages import HumanMessage, SystemMessage -from ..run_rag import rag from ..rag.llm import LLM, LLMConfig +from ..run_rag import rag from .config import WebsearchConfig - +from .logging_config import logger @dataclass @@ -74,7 +70,7 @@ def generate_summary(self, rag_answer, query: str): "Answer: \n" "---------------------------" ) - + messages = [ SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), HumanMessage(content=prompt), @@ -96,7 +92,7 @@ def evaluate_subquery_relevance(self, query, current_subqueries, previous_subque ] response = self.llm.invoke(messages) response = self._clean_llm_output(response.content) - + if 'no' in response : return False else : @@ -105,13 +101,13 @@ def evaluate_subquery_relevance(self, query, current_subqueries, previous_subque def _clean_llm_output(self, content: str): delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" - + if delimiter not in content: return content # Extract the section after the delimiter cleaned_section = content.split(delimiter, 1)[-1].lower().strip() - + return cleaned_section @@ -150,8 +146,8 @@ def generate_subqueries( cleaned_answer = re.findall(r"subquery \d+: (.*)", cleaned_answer) return cleaned_answer - - + + def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: """ @@ -167,9 +163,9 @@ def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: snippet = r.get("snippet", "") url = r.get("link", "") # note: it's "link" in LangChain results title = r.get("title", "") - + formatted_results.append({"snippet": snippet, "url": url, "title" : title}) - + return formatted_results except Exception as e: logger.error(f"DuckDuckGo search error: {e}") @@ -193,7 +189,7 @@ def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[st resp = self.llm.invoke(msgs) # parse clean_content = self._clean_llm_output(resp.content) - + sa_matches = re.findall( r"short answer:\s*(.*?)(?=detailed answer:)", clean_content, @@ -217,7 +213,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: self.rag_results = rag_ans rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None - + source_map = {} current_context = rag_summary final_short, final_detailed = "", "" @@ -239,7 +235,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: if loop > 0 and not self.evaluate_subquery_relevance(qr, subs, previous_sub): break - + for sq in subs: time.sleep(10) res = self.duckduckgo_search(query = sq) @@ -248,16 +244,16 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: for r in res: if r["url"] not in source_map: - source_map[r["url"]] = [] + source_map[r["url"]] = [] if r["title"] not in source_map[r["url"]]: - source_map[r["url"]].append(r["title"]) + source_map[r["url"]].append(r["title"]) snippet = f"{r['snippet']})" snippets.append(snippet) subquery_snippets.append(snippet) - + combined_snippets = "\n".join(subquery_snippets) summary = self.generate_summary(combined_snippets, sq) @@ -269,7 +265,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: web_summary = self.generate_summary(combined_sub_summaries, qr) web_summaries.append(web_summary) - if self.config.use_summary: + if self.config.use_summary: # Combine rag summary, web summary, and original query for final answer context_for_llm = f"RAG informations:\n{rag_summary or ''}\n\nWeb informations:\n{web_summary}" else: @@ -320,7 +316,7 @@ def run(self): outputs = [] - outputs = [self.process_record(rec) for rec in data] + outputs = [self.process_record(rec) for rec in data] # save @@ -336,12 +332,10 @@ def run(self): def run_api(self, use_rag, use_summary, query): """ Process queries and handle them with a temporary JSONL file. - Parameters: - use_rag (bool): Indicates whether to use RAG. - use_summary (bool): Indicates whether to use summarization. - query (list): List of query dictionaries. - Returns: - List of processed query results. """ @@ -350,11 +344,11 @@ def run_api(self, use_rag, use_summary, query): self.config.use_summary = use_summary temp_file_path = self._save_query_as_json(query) - + try: outputs = [] # Read from the temporary JSONL file - with open(temp_file_path, 'r', encoding='utf-8') as f: + with open(temp_file_path, 'r', encoding='utf-8') as f: if self.config.use_rag : for line in f: record = json.loads(line) @@ -363,7 +357,7 @@ def run_api(self, use_rag, use_summary, query): for line in f: record = json.loads(line.strip()) outputs.append(self.process_record(record)) - + return outputs finally: diff --git a/src/mmore/websearchRAG/websearch.py b/src/mmore/websearchRAG/websearch.py index d443e999..42bf9d23 100644 --- a/src/mmore/websearchRAG/websearch.py +++ b/src/mmore/websearchRAG/websearch.py @@ -1,8 +1,11 @@ -from typing import List, Dict +from typing import Dict + from langchain_community.tools import DuckDuckGoSearchResults from langchain_community.utilities import DuckDuckGoSearchAPIWrapper + from ..rag.llm import LLM + class WebsearchOnly: """Class dedicated to performing web searches and validating their usefulness.""" @@ -18,7 +21,7 @@ def websearch_pipeline(self, query: str) -> Dict[str, str]: def resume_web_search(self, query: str, web_output: str) -> str: """Call LLM to resume the current web output based on the original query, return a summary of the web search and the source.""" - llm = LLM() + llm = LLM() prompt = ( f"Original Query: '{query}'\n" f"Web content: '{web_output}'\n" From 8c99ece4a18e7befdb25ad4f6275cb7ad6456d99 Mon Sep 17 00:00:00 2001 From: fabnemEPFL Date: Mon, 4 Aug 2025 14:37:35 +0200 Subject: [PATCH 20/33] ruff format --- src/mmore/cli.py | 2 + src/mmore/run_rag.py | 7 +- src/mmore/run_websearch.py | 29 ++--- src/mmore/websearchRAG/config.py | 10 +- src/mmore/websearchRAG/logging_config.py | 6 +- src/mmore/websearchRAG/pipeline.py | 149 ++++++++++++----------- src/mmore/websearchRAG/websearch.py | 7 +- 7 files changed, 109 insertions(+), 101 deletions(-) diff --git a/src/mmore/cli.py b/src/mmore/cli.py index 631451d4..99b4b07b 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -225,6 +225,7 @@ def index_api(config_file, host, port): run_api(config_file, host, port) + @main.command() @click.option( "--config-file", @@ -242,6 +243,7 @@ def websearch(config_file): run_websearch(config_dict) + @main.command() @click.option( "--host", diff --git a/src/mmore/run_rag.py b/src/mmore/run_rag.py index ca113a47..1c08f3e8 100644 --- a/src/mmore/run_rag.py +++ b/src/mmore/run_rag.py @@ -66,14 +66,17 @@ class InnerInput(BaseModel): input: str collection_name: Optional[str] = None + class RAGInput(BaseModel): input: InnerInput + class RAGOutput(BaseModel): input: Optional[str] = None context: Optional[str] = None answer: Optional[str] = None + def create_api(rag: RAGPipeline, endpoint: str): app = FastAPI( title="RAG Pipeline API", @@ -117,9 +120,7 @@ def rag(config_file): uvicorn.run(app, host=config_args.host, port=config_args.port) else: - raise ValueError( - f"Unknown mode: {config.mode}. Should be either api or local" - ) + raise ValueError(f"Unknown mode: {config.mode}. Should be either api or local") if __name__ == "__main__": diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index d59fe1f6..2f47bee5 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -27,7 +27,6 @@ torch.backends.cuda.enable_math_sdp(True) - @dataclass class WebsearchInferenceConfig: websearch: WebsearchConfig @@ -67,18 +66,11 @@ class QueryInput(BaseModel): class WebQuery(BaseModel): query: QueryInput = Field( - ..., - description="Search query with input and optional collection name" - ) - use_rag: bool = Field( - False, - description="Include RAG context", - example=True + ..., description="Search query with input and optional collection name" ) + use_rag: bool = Field(False, description="Include RAG context", example=True) use_summary: bool = Field( - True, - description="Enable subquery summary", - example=False + True, description="Enable subquery summary", example=False ) @@ -101,13 +93,15 @@ def create_api(config_file: str): @app.post("/websearch") # query = query parameter def websearch(query: WebQuery): - #charge la pipeline directement depuis rag_pp - #changer le config_file avec le config file du rag --> ajouter ce que l'utilisateur demande + # charge la pipeline directement depuis rag_pp + # changer le config_file avec le config file du rag --> ajouter ce que l'utilisateur demande pipeline = WebsearchPipeline(config=config_file.websearch) if query.use_rag: logger.info("Launch RAG") - config_rag = load_config(config_file.websearch.rag_config_path, RAGInferenceConfig) + config_rag = load_config( + config_file.websearch.rag_config_path, RAGInferenceConfig + ) logger.info("Creating the RAG Pipeline...") rag_pp = RAGPipeline.from_config(config_rag.rag) data = rag_pp([query.query.dict()], return_dict=True) @@ -121,19 +115,20 @@ def websearch(query: WebQuery): answers = pipeline.run_api(query.use_rag, query.use_summary, data) logger.info("Websearch done") - return answers return app if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the Websearch (+ optional RAG) pipeline.") + parser = argparse.ArgumentParser( + description="Run the Websearch (+ optional RAG) pipeline." + ) parser.add_argument( "--config-file", type=str, required=True, - help="Path to the Websearch configuration file (YAML)." + help="Path to the Websearch configuration file (YAML).", ) args = parser.parse_args() run_websearch(args.config_file) diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index a4e7d047..1af5f53d 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -36,11 +36,13 @@ class WebsearchConfig: n_subqueries: int = 3 n_loops: int = 2 max_searches: int = 10 - llm_config: Dict[str, Any] = field(default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 1200}) + llm_config: Dict[str, Any] = field( + default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 1200} + ) mode: Literal["local", "api"] = "local" def __post_init__(self): - required_fields = ["n_loops","n_subqueries", "max_searches", "mode"] + required_fields = ["n_loops", "n_subqueries", "max_searches", "mode"] for field_name in required_fields: if not getattr(self, field_name): raise ValueError(f"'{field_name}' is a required field.") @@ -65,7 +67,9 @@ def access_rag_config(self) -> Dict[str, Any]: rag_config_full_path = Path(self.rag_config_path) if not rag_config_full_path.exists(): - raise FileNotFoundError(f"RAG config file not found at {rag_config_full_path}") + raise FileNotFoundError( + f"RAG config file not found at {rag_config_full_path}" + ) # Load the RAG configuration with open(rag_config_full_path, "r") as file: diff --git a/src/mmore/websearchRAG/logging_config.py b/src/mmore/websearchRAG/logging_config.py index 7cdd44be..7c772ff9 100644 --- a/src/mmore/websearchRAG/logging_config.py +++ b/src/mmore/websearchRAG/logging_config.py @@ -13,11 +13,13 @@ # Prevent multiple handlers if the logger is configured multiple times if not logger.handlers: # Create a file handler to log to a file - file_handler = logging.FileHandler('shared_log_file.log') + file_handler = logging.FileHandler("shared_log_file.log") file_handler.setLevel(logging.DEBUG) # Define log format - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) file_handler.setFormatter(formatter) # Add file handler to logger diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index bc80e90f..4a60090e 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -28,7 +28,6 @@ class ProcessedResponse: sources: Dict[str, Any] # Maps URLs to lists of titles - class WebsearchPipeline: """ Pipeline for running RAG and iterative websearch loops, @@ -40,22 +39,24 @@ def __init__(self, config: WebsearchConfig): self.llm = self._initialize_llm() self.rag_results = None self.wrapper = DuckDuckGoSearchAPIWrapper(max_results=self.config.max_searches) - self.search = DuckDuckGoSearchResults(api_wrapper=self.wrapper, output_format="list") - + self.search = DuckDuckGoSearchResults( + api_wrapper=self.wrapper, output_format="list" + ) def _initialize_llm(self) -> LLM: - if self.config.use_rag : + if self.config.use_rag: rag_cfg = self.config.access_rag_config() llm_conf = rag_cfg.get("rag", {}).get("llm") if llm_conf is None: - raise ValueError("Missing 'llm' config under 'rag' in RAG configuration.") + raise ValueError( + "Missing 'llm' config under 'rag' in RAG configuration." + ) return LLM.from_config(LLMConfig(**llm_conf)) - else : + else: base_conf = self.config.get_llm_config() base_conf = base_conf.__dict__ return LLM.from_config(LLMConfig(**base_conf)) - def generate_summary(self, rag_answer, query: str): """ Summarize the RAG answer (used when rag_summary=True) @@ -72,13 +73,17 @@ def generate_summary(self, rag_answer, query: str): ) messages = [ - SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), + SystemMessage( + content="You are a helpful assistant that summarizes text relevant to the question." + ), HumanMessage(content=prompt), ] response = self.llm.invoke(messages) return self._clean_llm_output(response.content) - def evaluate_subquery_relevance(self, query, current_subqueries, previous_subqueries): + def evaluate_subquery_relevance( + self, query, current_subqueries, previous_subqueries + ): prompt = ( f"Original query:\n{query}\n\n" f"Previous subqueries that contribute to understanding:\n{previous_subqueries}\n\n" @@ -93,12 +98,11 @@ def evaluate_subquery_relevance(self, query, current_subqueries, previous_subque response = self.llm.invoke(messages) response = self._clean_llm_output(response.content) - if 'no' in response : + if "no" in response: return False - else : + else: return True - def _clean_llm_output(self, content: str): delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" @@ -110,11 +114,8 @@ def _clean_llm_output(self, content: str): return cleaned_section - def generate_subqueries( - self, - original_query: str, - current_context: Optional[str] = None + self, original_query: str, current_context: Optional[str] = None ) -> List[str]: """ Generate concise search subqueries @@ -134,10 +135,11 @@ def generate_subqueries( f"---ANSWER ---" ) - prompt = instruction + task messages = [ - SystemMessage(content="You are an assistant specializing in generating search queries."), + SystemMessage( + content="You are an assistant specializing in generating search queries." + ), HumanMessage(content=prompt), ] @@ -146,9 +148,6 @@ def generate_subqueries( cleaned_answer = re.findall(r"subquery \d+: (.*)", cleaned_answer) return cleaned_answer - - - def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: """ Perform a DuckDuckGo search using LangChain DuckDuckGo wrapper @@ -164,28 +163,33 @@ def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: url = r.get("link", "") # note: it's "link" in LangChain results title = r.get("title", "") - formatted_results.append({"snippet": snippet, "url": url, "title" : title}) + formatted_results.append( + {"snippet": snippet, "url": url, "title": title} + ) return formatted_results except Exception as e: logger.error(f"DuckDuckGo search error: {e}") return [] - - def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[str]) -> Dict[str, str]: + def integrate_with_llm( + self, original: str, rag_doc: str, web_snippets: List[str] + ) -> Dict[str, str]: # Build prompt for short & detailed answer sources = "\n".join(web_snippets) prompt = ( - f"Original Query: {original}\n" - f"RAG Document Information:\n{rag_doc}\n\n" - f"Web Information:\n{sources}\n\n" - "Provide the response in the following format:\n" - "short answer: \n" - "detailed answer: " - ) - + f"Original Query: {original}\n" + f"RAG Document Information:\n{rag_doc}\n\n" + f"Web Information:\n{sources}\n\n" + "Provide the response in the following format:\n" + "short answer: \n" + "detailed answer: " + ) - msgs = [SystemMessage(content="You are a research assistant."), HumanMessage(content=prompt)] + msgs = [ + SystemMessage(content="You are a research assistant."), + HumanMessage(content=prompt), + ] resp = self.llm.invoke(msgs) # parse clean_content = self._clean_llm_output(resp.content) @@ -193,26 +197,23 @@ def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[st sa_matches = re.findall( r"short answer:\s*(.*?)(?=detailed answer:)", clean_content, - flags=re.IGNORECASE|re.DOTALL + flags=re.IGNORECASE | re.DOTALL, ) da_matches = re.findall( - r"detailed answer:\s*(.*)", - clean_content, - flags=re.IGNORECASE|re.DOTALL + r"detailed answer:\s*(.*)", clean_content, flags=re.IGNORECASE | re.DOTALL ) short = sa_matches[-1].strip().rstrip(",") if sa_matches else "" detailed = da_matches[-1].strip() if da_matches else "" return {"short": short, "detailed": detailed} - - def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: qr = rec.get("input", "").strip() rag_ans = rec.get("answer", "") if self.config.use_rag else "" self.rag_results = rag_ans - rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None - + rag_summary = ( + self.generate_summary(rag_ans, qr) if self.config.use_rag else None + ) source_map = {} current_context = rag_summary @@ -222,7 +223,6 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: web_summaries = [] previous_sub = [] - for loop in range(self.config.n_loops): if self.config.use_rag: subs = self.generate_subqueries(qr, current_context) @@ -232,13 +232,14 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: snippets = [] subquery_summaries = [] - - if loop > 0 and not self.evaluate_subquery_relevance(qr, subs, previous_sub): + if loop > 0 and not self.evaluate_subquery_relevance( + qr, subs, previous_sub + ): break for sq in subs: time.sleep(10) - res = self.duckduckgo_search(query = sq) + res = self.duckduckgo_search(query=sq) subquery_snippets = [] @@ -253,7 +254,6 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: snippets.append(snippet) subquery_snippets.append(snippet) - combined_snippets = "\n".join(subquery_snippets) summary = self.generate_summary(combined_snippets, sq) @@ -261,7 +261,9 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: previous_sub = subs - combined_sub_summaries = "\n".join([str(s) if s else "" for s in subquery_summaries]) + combined_sub_summaries = "\n".join( + [str(s) if s else "" for s in subquery_summaries] + ) web_summary = self.generate_summary(combined_sub_summaries, qr) web_summaries.append(web_summary) @@ -272,7 +274,9 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: # If not summarizing subqueries, use rag summary or current context with snippets context_for_llm = current_context - combined_web_summaries = "\n".join([str(s) if s else "" for s in web_summaries]) + combined_web_summaries = "\n".join( + [str(s) if s else "" for s in web_summaries] + ) web_summary_all = self.generate_summary(combined_web_summaries, qr) # Current context, web content to generate the answer @@ -282,20 +286,18 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: # Prepare context for next search loop current_context = final_detailed - solution = ProcessedResponse( - query=qr, - rag_informations=self.rag_results, - rag_summary=rag_summary if self.config.use_rag else None, - web_summary=web_summary_all, - short_answer=final_short, - detailed_answer=final_detailed, - sources=source_map + solution = ProcessedResponse( + query=qr, + rag_informations=self.rag_results, + rag_summary=rag_summary if self.config.use_rag else None, + web_summary=web_summary_all, + short_answer=final_short, + detailed_answer=final_detailed, + sources=source_map, ) return asdict(solution) - - def run(self): # RAG pipeline if self.config.use_rag: @@ -305,30 +307,25 @@ def run(self): rag(self.config.rag_config_path) rc = self.config.access_rag_config() self.config.input_file = rc["mode_args"]["output_file"] - with open(self.config.input_file, 'r', encoding='utf-8') as f: + with open(self.config.input_file, "r", encoding="utf-8") as f: data = json.load(f) else: self.config.input_file = self.config.input_queries data = [] - with open(self.config.input_file, 'r', encoding='utf-8') as f: + with open(self.config.input_file, "r", encoding="utf-8") as f: for line in f: data.append(json.loads(line.strip())) # JSONL format - outputs = [] outputs = [self.process_record(rec) for rec in data] - # save outp = Path(self.config.output_file) outp.parent.mkdir(exist_ok=True, parents=True) - with open(outp, 'w', encoding='utf-8') as f: + with open(outp, "w", encoding="utf-8") as f: json.dump(outputs, f, ensure_ascii=False, indent=2) logger.info(f"Results saved to {outp}") - - - def run_api(self, use_rag, use_summary, query): """ Process queries and handle them with a temporary JSONL file. @@ -348,8 +345,8 @@ def run_api(self, use_rag, use_summary, query): try: outputs = [] # Read from the temporary JSONL file - with open(temp_file_path, 'r', encoding='utf-8') as f: - if self.config.use_rag : + with open(temp_file_path, "r", encoding="utf-8") as f: + if self.config.use_rag: for line in f: record = json.loads(line) outputs.append(self.process_record(record)) @@ -365,15 +362,21 @@ def run_api(self, use_rag, use_summary, query): logger.info(f"Deleting temporary file: {temp_file_path}") os.remove(temp_file_path) - def _save_query_as_json(self, query): """Save query to a temporary JSONL file and return the file path.""" - suffix = '.json' if self.config.use_rag else '.jsonl' - with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False) as temp_file: + suffix = ".json" if self.config.use_rag else ".jsonl" + with tempfile.NamedTemporaryFile( + mode="w", suffix=suffix, delete=False + ) as temp_file: # Convert Pydantic models to dictionaries if needed if isinstance(query, list): - temp_file.writelines(json.dumps(q.dict() if hasattr(q, "dict") else q) + '\n' for q in query) + temp_file.writelines( + json.dumps(q.dict() if hasattr(q, "dict") else q) + "\n" + for q in query + ) else: - temp_file.write(json.dumps(query.dict() if hasattr(query, "dict") else query) + '\n') + temp_file.write( + json.dumps(query.dict() if hasattr(query, "dict") else query) + "\n" + ) logger.info(f"Query saved to temporary file: {temp_file.name}") return temp_file.name diff --git a/src/mmore/websearchRAG/websearch.py b/src/mmore/websearchRAG/websearch.py index 42bf9d23..7f244b93 100644 --- a/src/mmore/websearchRAG/websearch.py +++ b/src/mmore/websearchRAG/websearch.py @@ -9,9 +9,11 @@ class WebsearchOnly: """Class dedicated to performing web searches and validating their usefulness.""" - def __init__(self, region: str = 'wt-wt', max_results: int = 10): + def __init__(self, region: str = "wt-wt", max_results: int = 10): """Initialize the WebsearchOnly class with search parameters.""" - self.wrapper = DuckDuckGoSearchAPIWrapper(region=region, max_results=max_results) + self.wrapper = DuckDuckGoSearchAPIWrapper( + region=region, max_results=max_results + ) def websearch_pipeline(self, query: str) -> Dict[str, str]: """Perform a single web search.""" @@ -29,4 +31,3 @@ def resume_web_search(self, query: str, web_output: str) -> str: ) response = llm.invoke(prompt) return response.strip() - From fb1247310304a9c6dbaba89dc467b2bdf679c009 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Fri, 8 Aug 2025 20:16:16 -0700 Subject: [PATCH 21/33] changed import --- src/mmore/cli.py | 10 +- src/mmore/run_websearch.py | 36 ++++--- src/mmore/websearchRAG/pipeline.py | 149 ++++++++++++++-------------- src/mmore/websearchRAG/websearch.py | 7 +- 4 files changed, 103 insertions(+), 99 deletions(-) diff --git a/src/mmore/cli.py b/src/mmore/cli.py index 631451d4..f8b62439 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -225,6 +225,7 @@ def index_api(config_file, host, port): run_api(config_file, host, port) + @main.command() @click.option( "--config-file", @@ -236,11 +237,12 @@ def websearch(config_file): """Run the Websearch (+ optional RAG) pipeline.""" from .run_websearch import run_websearch - # Load your YAML configuration and pass it into the runner - with open(config_file, "r") as f: - config_dict = yaml.safe_load(f) + # # Load your YAML configuration and pass it into the runner + # with open(config_file, "r") as f: + # config_dict = yaml.safe_load(f) + + run_websearch(config_file) - run_websearch(config_dict) @main.command() @click.option( diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index d59fe1f6..fc907d56 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -7,6 +7,7 @@ import torch import uvicorn +import yaml from dotenv import load_dotenv from fastapi import FastAPI from pydantic import BaseModel, Field @@ -27,7 +28,6 @@ torch.backends.cuda.enable_math_sdp(True) - @dataclass class WebsearchInferenceConfig: websearch: WebsearchConfig @@ -39,8 +39,10 @@ def __post_init__(self): def run_websearch(config_file): - # 1) Load config - cfg = load_config(config_file, WebsearchInferenceConfig) + with open(config_file, "r") as f: + config_dict = yaml.safe_load(f) + + cfg = load_config(config_dict, WebsearchInferenceConfig) ws = cfg.websearch if ws.mode == "local": pipeline = WebsearchPipeline(config=ws) @@ -67,18 +69,11 @@ class QueryInput(BaseModel): class WebQuery(BaseModel): query: QueryInput = Field( - ..., - description="Search query with input and optional collection name" - ) - use_rag: bool = Field( - False, - description="Include RAG context", - example=True + ..., description="Search query with input and optional collection name" ) + use_rag: bool = Field(False, description="Include RAG context", example=True) use_summary: bool = Field( - True, - description="Enable subquery summary", - example=False + True, description="Enable subquery summary", example=False ) @@ -101,13 +96,15 @@ def create_api(config_file: str): @app.post("/websearch") # query = query parameter def websearch(query: WebQuery): - #charge la pipeline directement depuis rag_pp - #changer le config_file avec le config file du rag --> ajouter ce que l'utilisateur demande + # charge la pipeline directement depuis rag_pp + # changer le config_file avec le config file du rag --> ajouter ce que l'utilisateur demande pipeline = WebsearchPipeline(config=config_file.websearch) if query.use_rag: logger.info("Launch RAG") - config_rag = load_config(config_file.websearch.rag_config_path, RAGInferenceConfig) + config_rag = load_config( + config_file.websearch.rag_config_path, RAGInferenceConfig + ) logger.info("Creating the RAG Pipeline...") rag_pp = RAGPipeline.from_config(config_rag.rag) data = rag_pp([query.query.dict()], return_dict=True) @@ -121,19 +118,20 @@ def websearch(query: WebQuery): answers = pipeline.run_api(query.use_rag, query.use_summary, data) logger.info("Websearch done") - return answers return app if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the Websearch (+ optional RAG) pipeline.") + parser = argparse.ArgumentParser( + description="Run the Websearch (+ optional RAG) pipeline." + ) parser.add_argument( "--config-file", type=str, required=True, - help="Path to the Websearch configuration file (YAML)." + help="Path to the Websearch configuration file (YAML).", ) args = parser.parse_args() run_websearch(args.config_file) diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index bc80e90f..4a60090e 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -28,7 +28,6 @@ class ProcessedResponse: sources: Dict[str, Any] # Maps URLs to lists of titles - class WebsearchPipeline: """ Pipeline for running RAG and iterative websearch loops, @@ -40,22 +39,24 @@ def __init__(self, config: WebsearchConfig): self.llm = self._initialize_llm() self.rag_results = None self.wrapper = DuckDuckGoSearchAPIWrapper(max_results=self.config.max_searches) - self.search = DuckDuckGoSearchResults(api_wrapper=self.wrapper, output_format="list") - + self.search = DuckDuckGoSearchResults( + api_wrapper=self.wrapper, output_format="list" + ) def _initialize_llm(self) -> LLM: - if self.config.use_rag : + if self.config.use_rag: rag_cfg = self.config.access_rag_config() llm_conf = rag_cfg.get("rag", {}).get("llm") if llm_conf is None: - raise ValueError("Missing 'llm' config under 'rag' in RAG configuration.") + raise ValueError( + "Missing 'llm' config under 'rag' in RAG configuration." + ) return LLM.from_config(LLMConfig(**llm_conf)) - else : + else: base_conf = self.config.get_llm_config() base_conf = base_conf.__dict__ return LLM.from_config(LLMConfig(**base_conf)) - def generate_summary(self, rag_answer, query: str): """ Summarize the RAG answer (used when rag_summary=True) @@ -72,13 +73,17 @@ def generate_summary(self, rag_answer, query: str): ) messages = [ - SystemMessage(content="You are a helpful assistant that summarizes text relevant to the question."), + SystemMessage( + content="You are a helpful assistant that summarizes text relevant to the question." + ), HumanMessage(content=prompt), ] response = self.llm.invoke(messages) return self._clean_llm_output(response.content) - def evaluate_subquery_relevance(self, query, current_subqueries, previous_subqueries): + def evaluate_subquery_relevance( + self, query, current_subqueries, previous_subqueries + ): prompt = ( f"Original query:\n{query}\n\n" f"Previous subqueries that contribute to understanding:\n{previous_subqueries}\n\n" @@ -93,12 +98,11 @@ def evaluate_subquery_relevance(self, query, current_subqueries, previous_subque response = self.llm.invoke(messages) response = self._clean_llm_output(response.content) - if 'no' in response : + if "no" in response: return False - else : + else: return True - def _clean_llm_output(self, content: str): delimiter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" @@ -110,11 +114,8 @@ def _clean_llm_output(self, content: str): return cleaned_section - def generate_subqueries( - self, - original_query: str, - current_context: Optional[str] = None + self, original_query: str, current_context: Optional[str] = None ) -> List[str]: """ Generate concise search subqueries @@ -134,10 +135,11 @@ def generate_subqueries( f"---ANSWER ---" ) - prompt = instruction + task messages = [ - SystemMessage(content="You are an assistant specializing in generating search queries."), + SystemMessage( + content="You are an assistant specializing in generating search queries." + ), HumanMessage(content=prompt), ] @@ -146,9 +148,6 @@ def generate_subqueries( cleaned_answer = re.findall(r"subquery \d+: (.*)", cleaned_answer) return cleaned_answer - - - def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: """ Perform a DuckDuckGo search using LangChain DuckDuckGo wrapper @@ -164,28 +163,33 @@ def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: url = r.get("link", "") # note: it's "link" in LangChain results title = r.get("title", "") - formatted_results.append({"snippet": snippet, "url": url, "title" : title}) + formatted_results.append( + {"snippet": snippet, "url": url, "title": title} + ) return formatted_results except Exception as e: logger.error(f"DuckDuckGo search error: {e}") return [] - - def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[str]) -> Dict[str, str]: + def integrate_with_llm( + self, original: str, rag_doc: str, web_snippets: List[str] + ) -> Dict[str, str]: # Build prompt for short & detailed answer sources = "\n".join(web_snippets) prompt = ( - f"Original Query: {original}\n" - f"RAG Document Information:\n{rag_doc}\n\n" - f"Web Information:\n{sources}\n\n" - "Provide the response in the following format:\n" - "short answer: \n" - "detailed answer: " - ) - + f"Original Query: {original}\n" + f"RAG Document Information:\n{rag_doc}\n\n" + f"Web Information:\n{sources}\n\n" + "Provide the response in the following format:\n" + "short answer: \n" + "detailed answer: " + ) - msgs = [SystemMessage(content="You are a research assistant."), HumanMessage(content=prompt)] + msgs = [ + SystemMessage(content="You are a research assistant."), + HumanMessage(content=prompt), + ] resp = self.llm.invoke(msgs) # parse clean_content = self._clean_llm_output(resp.content) @@ -193,26 +197,23 @@ def integrate_with_llm( self, original: str, rag_doc: str, web_snippets: List[st sa_matches = re.findall( r"short answer:\s*(.*?)(?=detailed answer:)", clean_content, - flags=re.IGNORECASE|re.DOTALL + flags=re.IGNORECASE | re.DOTALL, ) da_matches = re.findall( - r"detailed answer:\s*(.*)", - clean_content, - flags=re.IGNORECASE|re.DOTALL + r"detailed answer:\s*(.*)", clean_content, flags=re.IGNORECASE | re.DOTALL ) short = sa_matches[-1].strip().rstrip(",") if sa_matches else "" detailed = da_matches[-1].strip() if da_matches else "" return {"short": short, "detailed": detailed} - - def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: qr = rec.get("input", "").strip() rag_ans = rec.get("answer", "") if self.config.use_rag else "" self.rag_results = rag_ans - rag_summary = self.generate_summary(rag_ans, qr) if self.config.use_rag else None - + rag_summary = ( + self.generate_summary(rag_ans, qr) if self.config.use_rag else None + ) source_map = {} current_context = rag_summary @@ -222,7 +223,6 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: web_summaries = [] previous_sub = [] - for loop in range(self.config.n_loops): if self.config.use_rag: subs = self.generate_subqueries(qr, current_context) @@ -232,13 +232,14 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: snippets = [] subquery_summaries = [] - - if loop > 0 and not self.evaluate_subquery_relevance(qr, subs, previous_sub): + if loop > 0 and not self.evaluate_subquery_relevance( + qr, subs, previous_sub + ): break for sq in subs: time.sleep(10) - res = self.duckduckgo_search(query = sq) + res = self.duckduckgo_search(query=sq) subquery_snippets = [] @@ -253,7 +254,6 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: snippets.append(snippet) subquery_snippets.append(snippet) - combined_snippets = "\n".join(subquery_snippets) summary = self.generate_summary(combined_snippets, sq) @@ -261,7 +261,9 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: previous_sub = subs - combined_sub_summaries = "\n".join([str(s) if s else "" for s in subquery_summaries]) + combined_sub_summaries = "\n".join( + [str(s) if s else "" for s in subquery_summaries] + ) web_summary = self.generate_summary(combined_sub_summaries, qr) web_summaries.append(web_summary) @@ -272,7 +274,9 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: # If not summarizing subqueries, use rag summary or current context with snippets context_for_llm = current_context - combined_web_summaries = "\n".join([str(s) if s else "" for s in web_summaries]) + combined_web_summaries = "\n".join( + [str(s) if s else "" for s in web_summaries] + ) web_summary_all = self.generate_summary(combined_web_summaries, qr) # Current context, web content to generate the answer @@ -282,20 +286,18 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: # Prepare context for next search loop current_context = final_detailed - solution = ProcessedResponse( - query=qr, - rag_informations=self.rag_results, - rag_summary=rag_summary if self.config.use_rag else None, - web_summary=web_summary_all, - short_answer=final_short, - detailed_answer=final_detailed, - sources=source_map + solution = ProcessedResponse( + query=qr, + rag_informations=self.rag_results, + rag_summary=rag_summary if self.config.use_rag else None, + web_summary=web_summary_all, + short_answer=final_short, + detailed_answer=final_detailed, + sources=source_map, ) return asdict(solution) - - def run(self): # RAG pipeline if self.config.use_rag: @@ -305,30 +307,25 @@ def run(self): rag(self.config.rag_config_path) rc = self.config.access_rag_config() self.config.input_file = rc["mode_args"]["output_file"] - with open(self.config.input_file, 'r', encoding='utf-8') as f: + with open(self.config.input_file, "r", encoding="utf-8") as f: data = json.load(f) else: self.config.input_file = self.config.input_queries data = [] - with open(self.config.input_file, 'r', encoding='utf-8') as f: + with open(self.config.input_file, "r", encoding="utf-8") as f: for line in f: data.append(json.loads(line.strip())) # JSONL format - outputs = [] outputs = [self.process_record(rec) for rec in data] - # save outp = Path(self.config.output_file) outp.parent.mkdir(exist_ok=True, parents=True) - with open(outp, 'w', encoding='utf-8') as f: + with open(outp, "w", encoding="utf-8") as f: json.dump(outputs, f, ensure_ascii=False, indent=2) logger.info(f"Results saved to {outp}") - - - def run_api(self, use_rag, use_summary, query): """ Process queries and handle them with a temporary JSONL file. @@ -348,8 +345,8 @@ def run_api(self, use_rag, use_summary, query): try: outputs = [] # Read from the temporary JSONL file - with open(temp_file_path, 'r', encoding='utf-8') as f: - if self.config.use_rag : + with open(temp_file_path, "r", encoding="utf-8") as f: + if self.config.use_rag: for line in f: record = json.loads(line) outputs.append(self.process_record(record)) @@ -365,15 +362,21 @@ def run_api(self, use_rag, use_summary, query): logger.info(f"Deleting temporary file: {temp_file_path}") os.remove(temp_file_path) - def _save_query_as_json(self, query): """Save query to a temporary JSONL file and return the file path.""" - suffix = '.json' if self.config.use_rag else '.jsonl' - with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False) as temp_file: + suffix = ".json" if self.config.use_rag else ".jsonl" + with tempfile.NamedTemporaryFile( + mode="w", suffix=suffix, delete=False + ) as temp_file: # Convert Pydantic models to dictionaries if needed if isinstance(query, list): - temp_file.writelines(json.dumps(q.dict() if hasattr(q, "dict") else q) + '\n' for q in query) + temp_file.writelines( + json.dumps(q.dict() if hasattr(q, "dict") else q) + "\n" + for q in query + ) else: - temp_file.write(json.dumps(query.dict() if hasattr(query, "dict") else query) + '\n') + temp_file.write( + json.dumps(query.dict() if hasattr(query, "dict") else query) + "\n" + ) logger.info(f"Query saved to temporary file: {temp_file.name}") return temp_file.name diff --git a/src/mmore/websearchRAG/websearch.py b/src/mmore/websearchRAG/websearch.py index 42bf9d23..7f244b93 100644 --- a/src/mmore/websearchRAG/websearch.py +++ b/src/mmore/websearchRAG/websearch.py @@ -9,9 +9,11 @@ class WebsearchOnly: """Class dedicated to performing web searches and validating their usefulness.""" - def __init__(self, region: str = 'wt-wt', max_results: int = 10): + def __init__(self, region: str = "wt-wt", max_results: int = 10): """Initialize the WebsearchOnly class with search parameters.""" - self.wrapper = DuckDuckGoSearchAPIWrapper(region=region, max_results=max_results) + self.wrapper = DuckDuckGoSearchAPIWrapper( + region=region, max_results=max_results + ) def websearch_pipeline(self, query: str) -> Dict[str, str]: """Perform a single web search.""" @@ -29,4 +31,3 @@ def resume_web_search(self, query: str, web_output: str) -> str: ) response = llm.invoke(prompt) return response.strip() - From 904cca80f2aaac3b57258c3c04d0bd10a8491d76 Mon Sep 17 00:00:00 2001 From: laetitia-wilhelm Date: Fri, 8 Aug 2025 20:17:12 -0700 Subject: [PATCH 22/33] formatting done --- src/mmore/run_rag.py | 7 ++++--- src/mmore/websearchRAG/config.py | 10 +++++++--- src/mmore/websearchRAG/logging_config.py | 6 ++++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/mmore/run_rag.py b/src/mmore/run_rag.py index ca113a47..1c08f3e8 100644 --- a/src/mmore/run_rag.py +++ b/src/mmore/run_rag.py @@ -66,14 +66,17 @@ class InnerInput(BaseModel): input: str collection_name: Optional[str] = None + class RAGInput(BaseModel): input: InnerInput + class RAGOutput(BaseModel): input: Optional[str] = None context: Optional[str] = None answer: Optional[str] = None + def create_api(rag: RAGPipeline, endpoint: str): app = FastAPI( title="RAG Pipeline API", @@ -117,9 +120,7 @@ def rag(config_file): uvicorn.run(app, host=config_args.host, port=config_args.port) else: - raise ValueError( - f"Unknown mode: {config.mode}. Should be either api or local" - ) + raise ValueError(f"Unknown mode: {config.mode}. Should be either api or local") if __name__ == "__main__": diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index a4e7d047..1af5f53d 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -36,11 +36,13 @@ class WebsearchConfig: n_subqueries: int = 3 n_loops: int = 2 max_searches: int = 10 - llm_config: Dict[str, Any] = field(default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 1200}) + llm_config: Dict[str, Any] = field( + default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 1200} + ) mode: Literal["local", "api"] = "local" def __post_init__(self): - required_fields = ["n_loops","n_subqueries", "max_searches", "mode"] + required_fields = ["n_loops", "n_subqueries", "max_searches", "mode"] for field_name in required_fields: if not getattr(self, field_name): raise ValueError(f"'{field_name}' is a required field.") @@ -65,7 +67,9 @@ def access_rag_config(self) -> Dict[str, Any]: rag_config_full_path = Path(self.rag_config_path) if not rag_config_full_path.exists(): - raise FileNotFoundError(f"RAG config file not found at {rag_config_full_path}") + raise FileNotFoundError( + f"RAG config file not found at {rag_config_full_path}" + ) # Load the RAG configuration with open(rag_config_full_path, "r") as file: diff --git a/src/mmore/websearchRAG/logging_config.py b/src/mmore/websearchRAG/logging_config.py index 7cdd44be..7c772ff9 100644 --- a/src/mmore/websearchRAG/logging_config.py +++ b/src/mmore/websearchRAG/logging_config.py @@ -13,11 +13,13 @@ # Prevent multiple handlers if the logger is configured multiple times if not logger.handlers: # Create a file handler to log to a file - file_handler = logging.FileHandler('shared_log_file.log') + file_handler = logging.FileHandler("shared_log_file.log") file_handler.setLevel(logging.DEBUG) # Define log format - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) file_handler.setFormatter(formatter) # Add file handler to logger From 8d5376e99590062c6dc71c16bd58b4b9fed1d556 Mon Sep 17 00:00:00 2001 From: fabnemEPFL Date: Tue, 12 Aug 2025 11:47:45 +0200 Subject: [PATCH 23/33] removed unused import --- src/mmore/cli.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mmore/cli.py b/src/mmore/cli.py index cfafc5ca..35550a2f 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -1,7 +1,6 @@ from typing import Optional import click -import yaml @click.group() @@ -244,7 +243,6 @@ def websearch(config_file): run_websearch(config_file) - @main.command() @click.option( "--host", From d6d802c0c553d55fb2534cfb6d89afd0407cd904 Mon Sep 17 00:00:00 2001 From: fabnemEPFL <117652591+fabnemEPFL@users.noreply.github.com> Date: Wed, 13 Aug 2025 15:25:37 +0200 Subject: [PATCH 24/33] Simplified loading of configuration --- src/mmore/run_websearch.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index fc907d56..95e2e37d 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -7,7 +7,6 @@ import torch import uvicorn -import yaml from dotenv import load_dotenv from fastapi import FastAPI from pydantic import BaseModel, Field @@ -39,10 +38,7 @@ def __post_init__(self): def run_websearch(config_file): - with open(config_file, "r") as f: - config_dict = yaml.safe_load(f) - - cfg = load_config(config_dict, WebsearchInferenceConfig) + cfg = load_config(config_file, WebsearchInferenceConfig) ws = cfg.websearch if ws.mode == "local": pipeline = WebsearchPipeline(config=ws) From 9800fe9510e396b1a639393f196fec7b90a2287f Mon Sep 17 00:00:00 2001 From: Fabrice Nemo Date: Thu, 14 Aug 2025 17:38:50 +0200 Subject: [PATCH 25/33] fixed some logic issues --- src/mmore/run_websearch.py | 4 ++-- src/mmore/websearchRAG/config.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index 95e2e37d..78b44e06 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -58,8 +58,8 @@ def run_websearch(config_file): class QueryInput(BaseModel): input: str = Field(..., description="The user query") - collection_name: Optional[str] = Field( - None, description="The collection to search if use_rag set to True" + collection_name: str = Field( + "my_docs", description="The collection to search if use_rag set to True" ) diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index 1af5f53d..a102f65d 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -36,8 +36,8 @@ class WebsearchConfig: n_subqueries: int = 3 n_loops: int = 2 max_searches: int = 10 - llm_config: Dict[str, Any] = field( - default_factory=lambda: {"llm_name": "gpt-4", "max_new_tokens": 1200} + llm_config: LLMConfig = field( + default_factory=lambda: LLMConfig(**{"llm_name": "gpt-4", "max_new_tokens": 1200}) ) mode: Literal["local", "api"] = "local" @@ -49,9 +49,9 @@ def __post_init__(self): def get_llm_config(self) -> LLMConfig: """ - Convert the nested llm_config dict into an instance of rag.llm.LLMConfig. + Return the nested llm_config object. """ - return LLMConfig(**self.llm_config) + return self.llm_config def access_rag_config(self) -> Dict[str, Any]: """ From cfcf101184074fe91cd153b2cd7f42904ed23096 Mon Sep 17 00:00:00 2001 From: Fabrice Nemo Date: Fri, 15 Aug 2025 14:57:46 +0200 Subject: [PATCH 26/33] renamed a method in websearchRAG/websearch.py --- src/mmore/websearchRAG/websearch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mmore/websearchRAG/websearch.py b/src/mmore/websearchRAG/websearch.py index 7f244b93..e109671f 100644 --- a/src/mmore/websearchRAG/websearch.py +++ b/src/mmore/websearchRAG/websearch.py @@ -21,8 +21,8 @@ def websearch_pipeline(self, query: str) -> Dict[str, str]: web_output = search.run(query) return web_output - def resume_web_search(self, query: str, web_output: str) -> str: - """Call LLM to resume the current web output based on the original query, return a summary of the web search and the source.""" + def summarize_web_search(self, query: str, web_output: str) -> str: + """Call LLM to summarize the current web output based on the original query, return a summary of the web search and the source.""" llm = LLM() prompt = ( f"Original Query: '{query}'\n" From 2660792c90eea14ed9202c709b14d790ae4e8b91 Mon Sep 17 00:00:00 2001 From: Fabrice Nemo Date: Fri, 15 Aug 2025 14:58:25 +0200 Subject: [PATCH 27/33] fixed some typing / logic issues --- src/mmore/run_websearch.py | 14 +++---- src/mmore/websearchRAG/config.py | 6 ++- src/mmore/websearchRAG/pipeline.py | 58 +++++++++++++++++++++-------- src/mmore/websearchRAG/websearch.py | 9 +++-- 4 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py index 78b44e06..2ce90fa3 100644 --- a/src/mmore/run_websearch.py +++ b/src/mmore/run_websearch.py @@ -53,7 +53,7 @@ def run_websearch(config_file): uvicorn.run(app, host="0.0.0.0", port=8000) else: - raise ValueError(f"Unknown mode: {cfg.mode!r}. Must be 'local' or 'api'.") + raise ValueError(f"Unknown mode: {ws.mode!r}. Must be 'local' or 'api'.") class QueryInput(BaseModel): @@ -67,13 +67,13 @@ class WebQuery(BaseModel): query: QueryInput = Field( ..., description="Search query with input and optional collection name" ) - use_rag: bool = Field(False, description="Include RAG context", example=True) + use_rag: bool = Field(False, description="Include RAG context", examples=[True]) use_summary: bool = Field( - True, description="Enable subquery summary", example=False + True, description="Enable subquery summary", examples=[False] ) -def create_api(config_file: str): +def create_api(config: WebsearchInferenceConfig): app = FastAPI( title="mmore Websearch API", description="""This API is based on the OpenAPI 3.1 specification. You can find out more about Swagger at [https://swagger.io](https://swagger.io). @@ -92,14 +92,12 @@ def create_api(config_file: str): @app.post("/websearch") # query = query parameter def websearch(query: WebQuery): - # charge la pipeline directement depuis rag_pp - # changer le config_file avec le config file du rag --> ajouter ce que l'utilisateur demande - pipeline = WebsearchPipeline(config=config_file.websearch) + pipeline = WebsearchPipeline(config=config.websearch) if query.use_rag: logger.info("Launch RAG") config_rag = load_config( - config_file.websearch.rag_config_path, RAGInferenceConfig + config.websearch.rag_config_path, RAGInferenceConfig ) logger.info("Creating the RAG Pipeline...") rag_pp = RAGPipeline.from_config(config_rag.rag) diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py index a102f65d..f911ddde 100644 --- a/src/mmore/websearchRAG/config.py +++ b/src/mmore/websearchRAG/config.py @@ -28,16 +28,18 @@ class WebsearchConfig: """ rag_config_path: str # e.g., "../rag/config.yaml" + output_file: str use_rag: bool = False use_summary: bool = False input_file: Optional[str] = None input_queries: Optional[str] = None - output_file: Optional[str] = None n_subqueries: int = 3 n_loops: int = 2 max_searches: int = 10 llm_config: LLMConfig = field( - default_factory=lambda: LLMConfig(**{"llm_name": "gpt-4", "max_new_tokens": 1200}) + default_factory=lambda: LLMConfig( + **{"llm_name": "gpt-4", "max_new_tokens": 1200} + ) ) mode: Literal["local", "api"] = "local" diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py index 4a60090e..e960a055 100644 --- a/src/mmore/websearchRAG/pipeline.py +++ b/src/mmore/websearchRAG/pipeline.py @@ -9,6 +9,7 @@ from langchain_community.tools import DuckDuckGoSearchResults from langchain_community.utilities import DuckDuckGoSearchAPIWrapper +from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import HumanMessage, SystemMessage from ..rag.llm import LLM, LLMConfig @@ -21,13 +22,29 @@ class ProcessedResponse: query: str rag_informations: str - rag_summary: str + rag_summary: str | None web_summary: str short_answer: str detailed_answer: str sources: Dict[str, Any] # Maps URLs to lists of titles +def extract_response(content: str | list[str | dict]) -> str: + response_content = content + if isinstance(response_content, str): + response = response_content + else: + response_tmp = response_content[-1] + response_tmp: str | dict[str, str] + + if isinstance(response_tmp, str): + response = response_tmp + else: + response = response_tmp.get("content", "") + + return response + + class WebsearchPipeline: """ Pipeline for running RAG and iterative websearch loops, @@ -43,10 +60,10 @@ def __init__(self, config: WebsearchConfig): api_wrapper=self.wrapper, output_format="list" ) - def _initialize_llm(self) -> LLM: + def _initialize_llm(self) -> BaseChatModel: if self.config.use_rag: rag_cfg = self.config.access_rag_config() - llm_conf = rag_cfg.get("rag", {}).get("llm") + llm_conf: Dict[str, Any] = rag_cfg.get("rag", {}).get("llm") if llm_conf is None: raise ValueError( "Missing 'llm' config under 'rag' in RAG configuration." @@ -57,7 +74,7 @@ def _initialize_llm(self) -> LLM: base_conf = base_conf.__dict__ return LLM.from_config(LLMConfig(**base_conf)) - def generate_summary(self, rag_answer, query: str): + def generate_summary(self, rag_answer: str | None, query: str): """ Summarize the RAG answer (used when rag_summary=True) """ @@ -65,7 +82,7 @@ def generate_summary(self, rag_answer, query: str): "You have only the following context to answer the question, do not use any external knowledge.\n\n" f"Question: {query}\n\n" "Context:\n" - f"{rag_answer}\n\n" + f"{rag_answer or 'No context yet'}\n\n" "If the context contains the answer or any useful information, respond with that information. \n" "If no useful informations are, answer: no useful informations\n" "Answer: \n" @@ -78,8 +95,11 @@ def generate_summary(self, rag_answer, query: str): ), HumanMessage(content=prompt), ] - response = self.llm.invoke(messages) - return self._clean_llm_output(response.content) + + response_llm = self.llm.invoke(messages) + response = extract_response(response_llm.content) + + return self._clean_llm_output(response) def evaluate_subquery_relevance( self, query, current_subqueries, previous_subqueries @@ -95,8 +115,9 @@ def evaluate_subquery_relevance( SystemMessage(content="You are a helpful assistant"), HumanMessage(content=prompt), ] - response = self.llm.invoke(messages) - response = self._clean_llm_output(response.content) + response_llm = self.llm.invoke(messages) + response_content = extract_response(response_llm.content) + response = self._clean_llm_output(response_content) if "no" in response: return False @@ -143,8 +164,9 @@ def generate_subqueries( HumanMessage(content=prompt), ] - response = self.llm.invoke(messages) - cleaned_answer = self._clean_llm_output(response.content) + response_llm = self.llm.invoke(messages) + response = extract_response(response_llm.content) + cleaned_answer = self._clean_llm_output(response) cleaned_answer = re.findall(r"subquery \d+: (.*)", cleaned_answer) return cleaned_answer @@ -173,7 +195,7 @@ def duckduckgo_search(self, query: str) -> List[Dict[str, str]]: return [] def integrate_with_llm( - self, original: str, rag_doc: str, web_snippets: List[str] + self, original: str, rag_doc: str | None, web_snippets: List[str] ) -> Dict[str, str]: # Build prompt for short & detailed answer sources = "\n".join(web_snippets) @@ -190,9 +212,10 @@ def integrate_with_llm( SystemMessage(content="You are a research assistant."), HumanMessage(content=prompt), ] - resp = self.llm.invoke(msgs) + response_llm = self.llm.invoke(msgs) + response = extract_response(response_llm.content) # parse - clean_content = self._clean_llm_output(resp.content) + clean_content = self._clean_llm_output(response) sa_matches = re.findall( r"short answer:\s*(.*?)(?=detailed answer:)", @@ -219,6 +242,7 @@ def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]: current_context = rag_summary final_short, final_detailed = "", "" web_summary = "" + web_summary_all = "" # will be reassigned later web_summaries = [] previous_sub = [] @@ -307,11 +331,15 @@ def run(self): rag(self.config.rag_config_path) rc = self.config.access_rag_config() self.config.input_file = rc["mode_args"]["output_file"] + + assert self.config.input_file with open(self.config.input_file, "r", encoding="utf-8") as f: data = json.load(f) else: self.config.input_file = self.config.input_queries data = [] + + assert self.config.input_file with open(self.config.input_file, "r", encoding="utf-8") as f: for line in f: data.append(json.loads(line.strip())) # JSONL format @@ -360,7 +388,7 @@ def run_api(self, use_rag, use_summary, query): finally: # Delete the temporary file logger.info(f"Deleting temporary file: {temp_file_path}") - os.remove(temp_file_path) + os.remove(temp_file_path) def _save_query_as_json(self, query): """Save query to a temporary JSONL file and return the file path.""" diff --git a/src/mmore/websearchRAG/websearch.py b/src/mmore/websearchRAG/websearch.py index e109671f..e2b8c974 100644 --- a/src/mmore/websearchRAG/websearch.py +++ b/src/mmore/websearchRAG/websearch.py @@ -3,7 +3,7 @@ from langchain_community.tools import DuckDuckGoSearchResults from langchain_community.utilities import DuckDuckGoSearchAPIWrapper -from ..rag.llm import LLM +from ..rag.llm import LLM, LLMConfig class WebsearchOnly: @@ -23,11 +23,14 @@ def websearch_pipeline(self, query: str) -> Dict[str, str]: def summarize_web_search(self, query: str, web_output: str) -> str: """Call LLM to summarize the current web output based on the original query, return a summary of the web search and the source.""" - llm = LLM() + llm = LLM.from_config( + LLMConfig(llm_name="OpenMeditron/meditron3-8b", max_new_tokens=1200) + ) prompt = ( f"Original Query: '{query}'\n" f"Web content: '{web_output}'\n" "Based on the original query and the web content, can you provide a response to the original query?" ) - response = llm.invoke(prompt) + response = llm.invoke(prompt).content + assert isinstance(response, str) return response.strip() From 0094a230cc1a4a5021b2ac7b1df712afafe85e57 Mon Sep 17 00:00:00 2001 From: Fabrice Nemo Date: Tue, 2 Sep 2025 11:40:47 +0200 Subject: [PATCH 28/33] updated slightly the documentation of websearch --- docs/websearch.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/websearch.md b/docs/websearch.md index 865adb54..eeda4fdf 100644 --- a/docs/websearch.md +++ b/docs/websearch.md @@ -89,8 +89,8 @@ The pipeline provides outputs in the following structure: - **Short Answer:** Concise response derived from combined RAG and WebSearch results. - **Detailed Answer:** Expanded response with context from both sources. - **Sources:** A list of URLs and their respective titles in the format: - ``` - URL: https://example.com, Title {Title 1; Title 2; ...} + ```json + {"URL": ["Title 1", "Title 2", ...]} ``` ### Example Output From f70d837d8fb3c581e3e50326c0692d7c2068073f Mon Sep 17 00:00:00 2001 From: fabnemEPFL Date: Thu, 4 Sep 2025 10:53:52 +0000 Subject: [PATCH 29/33] rotation of devices when loading several llms --- src/mmore/rag/llm.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/mmore/rag/llm.py b/src/mmore/rag/llm.py index 8b622c0b..334d6cca 100644 --- a/src/mmore/rag/llm.py +++ b/src/mmore/rag/llm.py @@ -2,7 +2,8 @@ from dataclasses import dataclass # from getpass import getpass -from typing import Optional, cast +from typing import ClassVar, Optional, cast +import torch from langchain_anthropic import ChatAnthropic from langchain_cohere import ChatCohere @@ -107,6 +108,9 @@ def api_key(self): class LLM(BaseChatModel): """Class parsing the model name and arguments to load the correct LangChain model""" + device_count: ClassVar[int] = 0 + nb_devices: ClassVar[int] = torch.cuda.device_count() if torch.cuda.is_available() else 1 + @staticmethod def _check_key(org): if f"{org}_API_KEY" not in os.environ: @@ -122,11 +126,12 @@ def from_config(cls, config: str | LLMConfig) -> BaseChatModel: config = load_config(config, LLMConfig) if config.organization == "HF": + cls.device_count = (cls.device_count + 1) % (nb_devices + 1) # rotate devices, +1 for accounting the -1 below return ChatHuggingFace( llm=HuggingFacePipeline.from_model_id( config.llm_name, task="text-generation", - device_map="auto", + device=cls.device_count-1, pipeline_kwargs=config.generation_kwargs, ) ) From f1831787708140587dcba7fb9ffc9b6effaf0b5a Mon Sep 17 00:00:00 2001 From: Fabrice Nemo Date: Thu, 4 Sep 2025 13:00:47 +0200 Subject: [PATCH 30/33] removed useless newlines --- docs/websearch.md | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/docs/websearch.md b/docs/websearch.md index eeda4fdf..c3df6f2c 100644 --- a/docs/websearch.md +++ b/docs/websearch.md @@ -1,8 +1,5 @@ # WebSearch Integration in RAG Pipeline - - - ## Implementation ### Overview @@ -19,7 +16,6 @@ Based on the implementation of the `RAG` module, the `Websearch` module enables You can customize various parts of the pipeline by defining [an inference Websearch configuration file](/examples/websearchRAG/config_api.yaml). - Users can adjust the pipeline according to their [requirements](/examples/websearchRAG/config.yaml) through the following parameters: - `use_rag`: Enables or disables RAG retrieval. @@ -27,8 +23,6 @@ Users can adjust the pipeline according to their [requirements](/examples/websea - `n_loops`: Defines the number of search iterations to refine results. - `n_subqueries`: Specifies the number of subqueries generated for each input query. - - ### Workflow 0. **RAG pipeline:** @@ -44,11 +38,6 @@ Users can adjust the pipeline according to their [requirements](/examples/websea 5. **Start again:** - We loop again from step 1 with the updated current knowledge - - - - - ## Minimal Example Here is a example to create a Websearch pipeline hosted through [LangServe](https://python.langchain.com/docs/langserve/) servers. @@ -80,7 +69,6 @@ Here is a example to create a Websearch pipeline hosted through [LangServe](http For both mode, if we want to use the RAG pipeline, it is necessay to provide the path to the rag configuration file. - ## Results and Outputs ### Output Format @@ -107,4 +95,3 @@ The pipeline provides outputs in the following structure: "https://example2.com" : ["Advancements in AI; AI Trends"], } } - From e72efaf6ec5e6efa404f1dfee74b5fbc1f4f9288 Mon Sep 17 00:00:00 2001 From: Fabrice Nemo Date: Thu, 4 Sep 2025 13:02:04 +0200 Subject: [PATCH 31/33] ruff formatting --- src/mmore/rag/llm.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/mmore/rag/llm.py b/src/mmore/rag/llm.py index 334d6cca..679076ac 100644 --- a/src/mmore/rag/llm.py +++ b/src/mmore/rag/llm.py @@ -109,7 +109,9 @@ class LLM(BaseChatModel): """Class parsing the model name and arguments to load the correct LangChain model""" device_count: ClassVar[int] = 0 - nb_devices: ClassVar[int] = torch.cuda.device_count() if torch.cuda.is_available() else 1 + nb_devices: ClassVar[int] = ( + torch.cuda.device_count() if torch.cuda.is_available() else 1 + ) @staticmethod def _check_key(org): @@ -126,12 +128,14 @@ def from_config(cls, config: str | LLMConfig) -> BaseChatModel: config = load_config(config, LLMConfig) if config.organization == "HF": - cls.device_count = (cls.device_count + 1) % (nb_devices + 1) # rotate devices, +1 for accounting the -1 below + cls.device_count = (cls.device_count + 1) % ( + nb_devices + 1 + ) # rotate devices, +1 for accounting the -1 below return ChatHuggingFace( llm=HuggingFacePipeline.from_model_id( config.llm_name, task="text-generation", - device=cls.device_count-1, + device=cls.device_count - 1, pipeline_kwargs=config.generation_kwargs, ) ) From 1de7a576fad2be31f99ebce20f9f66c1c1ae0f97 Mon Sep 17 00:00:00 2001 From: Fabrice Nemo Date: Thu, 4 Sep 2025 13:04:11 +0200 Subject: [PATCH 32/33] bug fix --- src/mmore/rag/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mmore/rag/llm.py b/src/mmore/rag/llm.py index 679076ac..ec9f874f 100644 --- a/src/mmore/rag/llm.py +++ b/src/mmore/rag/llm.py @@ -129,7 +129,7 @@ def from_config(cls, config: str | LLMConfig) -> BaseChatModel: if config.organization == "HF": cls.device_count = (cls.device_count + 1) % ( - nb_devices + 1 + cls.nb_devices + 1 ) # rotate devices, +1 for accounting the -1 below return ChatHuggingFace( llm=HuggingFacePipeline.from_model_id( From 2c7fcdec973ccc38fb83b6ddf569ba325c81f504 Mon Sep 17 00:00:00 2001 From: Fabrice Nemo Date: Thu, 4 Sep 2025 13:04:36 +0200 Subject: [PATCH 33/33] isort --- src/mmore/rag/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mmore/rag/llm.py b/src/mmore/rag/llm.py index ec9f874f..3079777c 100644 --- a/src/mmore/rag/llm.py +++ b/src/mmore/rag/llm.py @@ -3,8 +3,8 @@ # from getpass import getpass from typing import ClassVar, Optional, cast -import torch +import torch from langchain_anthropic import ChatAnthropic from langchain_cohere import ChatCohere from langchain_core.language_models.chat_models import BaseChatModel