diff --git a/docs/websearch.md b/docs/websearch.md new file mode 100644 index 00000000..c3df6f2c --- /dev/null +++ b/docs/websearch.md @@ -0,0 +1,97 @@ +# WebSearch Integration in RAG Pipeline + +## Implementation + +### Overview +The WebSearch integration uses the DuckDuckGo search API through a wrapper provided by the LangChain library. The implementation combines: + +- **DuckDuckGo Search:** Retrieves external information through concise subqueries generated by an LLM +- **LLM Integration:** Summarizes and integrates retrieved web snippets with RAG results to provide a final answer + +## Customization + +Based on the implementation of the `RAG` module, the `Websearch` module enables the creation of a modular Websearch inference pipeline for your indexed multimodal documents, using two inference modes: + 1. **API**: Creates a server hosting the pipeline + 2. **Local**: Runs the inference locally + +You can customize various parts of the pipeline by defining [an inference Websearch configuration file](/examples/websearchRAG/config_api.yaml). + +Users can adjust the pipeline according to their [requirements](/examples/websearchRAG/config.yaml) through the following parameters: + +- `use_rag`: Enables or disables RAG retrieval. +- `use_summary`: Activates summarization of retrieved web snippets. +- `n_loops`: Defines the number of search iterations to refine results. +- `n_subqueries`: Specifies the number of subqueries generated for each input query. + +### Workflow + +0. **RAG pipeline:** + - Launch the RAG pipeline if `use_rag` is enabled and set its output as the current knowledge +1. **Input Query Processing:** + - The pipeline processes the user query and generates subqueries for web searches in order to complete the current knowledge. +2. **WebSearch Execution:** + - DuckDuckGo searches are performed for each subquery +3. **Summarization:** + - Retrieved web snippets are summarized using an LLM if `use_summary` is enabled. +4. 
**Integration with RAG (if `use_rag`):** + - WebSearch results are combined with RAG outputs to generate the current knowledge +5. **Start again:** + - We loop again from step 1 with the updated current knowledge + +## Minimal Example + +Here is an example of how to create a Websearch pipeline hosted through [LangServe](https://python.langchain.com/docs/langserve/) servers. + +1. Create your Websearch inference config file based on the [local example](/examples/websearchRAG/config.yaml) or the [API example](/examples/websearchRAG/config_api.yaml). + +2. Start your Websearch pipeline using the `run_websearch.py` script and your config file: + ```bash + python3 -m mmore websearch --config_file /path/to/config.yaml + ``` + +3. In API mode, query the server like any other LangServe server: + ```bash + curl --location --request POST http://localhost:8000/websearch/ \ + -H 'Content-Type: application/json' \ + -d ' { + "query": { + "input": "When was Barack Obama born?", + "collection_name": "my_docs" + }, + "use_rag": true, + "use_summary": true + }' + ``` + In API mode, it is necessary to provide the `use_rag` and `use_summary` parameters in the query; the values of `n_loops` and `n_subqueries` are defined in the config file. + + In local mode, the pipeline is run directly with the input data specified in the configuration file and the result is saved at the specified path. + + In both modes, if you want to use the RAG pipeline, it is necessary to provide the path to the RAG configuration file. + +## Results and Outputs + +### Output Format +The pipeline provides outputs in the following structure: + +- **Short Answer:** Concise response derived from combined RAG and WebSearch results. +- **Detailed Answer:** Expanded response with context from both sources. 
+- **Sources:** A list of URLs and their respective titles in the format: + ```json + {"URL": ["Title 1", "Title 2", ...]} + ``` + +### Example Output +```json +{ + "query": "What are the latest advancements in AI?", + "rag_informations": "Pre-existing knowledge from RAG.", + "rag_summary": "Summary of RAG knowledge.", + "web_summary": "Summarized web search information.", + "short_answer": "AI advancements include new models and real-time applications.", + "detailed_answer": "Recent AI developments include breakthroughs in large language models and innovative real-time applications, supported by diverse resources from RAG and web search.", + "sources": { + "https://example1.com" : ["AI Breakthrough; Latest in AI"], + "https://example2.com" : ["Advancements in AI; AI Trends"] + } +} +``` diff --git a/examples/websearchRAG/config.yaml b/examples/websearchRAG/config.yaml new file mode 100644 index 00000000..dfd34bc0 --- /dev/null +++ b/examples/websearchRAG/config.yaml @@ -0,0 +1,14 @@ +websearch: + use_rag: true + rag_config_path: examples/rag/config.yaml + use_summary: true + n_subqueries : 2 + input_file: examples/rag/output.json + input_queries: examples/rag/queries.jsonl + output_file: examples/websearchRAG/enhanced_results_trial.json + n_loops: 2 + max_searches: 5 + mode: local + llm_config: + llm_name: OpenMeditron/meditron3-8b + max_new_tokens: 1200 diff --git a/examples/websearchRAG/config_api.yaml b/examples/websearchRAG/config_api.yaml new file mode 100644 index 00000000..03cb2469 --- /dev/null +++ b/examples/websearchRAG/config_api.yaml @@ -0,0 +1,15 @@ +websearch: + rag_config_path: examples/rag/config_api.yaml + n_subqueries : 3 + n_loops: 2 + max_searches: 10 + mode: api + llm_config: + llm_name: OpenMeditron/meditron3-8b + max_new_tokens: 1200 + +# Mode Config +mode_args: + endpoint: '/rag' + port: 8000 + host: 'localhost' diff --git a/examples/websearchRAG/enhanced_results_trial.json b/examples/websearchRAG/enhanced_results_trial.json new file mode 
100644 index 00000000..9c42305f --- /dev/null +++ b/examples/websearchRAG/enhanced_results_trial.json @@ -0,0 +1,234 @@ +[ + { + "query": "When was Barack Obama born?", + "rag_informations": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUse the following context to answer the questions.\n\nContext:\n[1] Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. In *Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition*, pages 9556–9567, 2024.\n- [61] Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu\n\nJiang, Weiming Ren, Yuxuan Sun, Cong Wei, Botao Yu, Ruibin Yuan, Renliang Sun, Ming Yin, Boyuan Zheng, Zhenzhu Yang, Yibo Liu, Wenhao Huang, Huan Sun, Yu Su, and Wenhu Chen. Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. *arXiv preprint arXiv:2311.16502*, 2023.\n\n- [62] Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, and Lucas Beyer. Sigmoid loss for language image pre-training. In *Proceedings of the IEEE/CVF International Conference on Computer Vision*, pages 11975–11986, 2023.\n- [63] Ruohong Zhang, Liangke Gui, Zhiqing Sun, Yihao Feng, Keyang Xu, Yuanhan Zhang, Di Fu, Chunyuan Li, Alexander Hauptmann, Yonatan Bisk, et al. Direct preference optimization of video large multimodal models from language model reward. *arXiv preprint arXiv:2404.01258*, 2024.\n- [64] Renrui Zhang, Jiaming Han, Aojun Zhou, Xiangfei Hu, Shilin Yan, Pan Lu, Hongsheng Li, Peng Gao, and Yu Qiao. LLaMA-adapter: Efficient fine-tuning of large language models with zero-initialized attention. In *The Twelfth International Conference on Learning Representations*, 2024.\n\n\n[2] - Skills: Emphasized on emotional intelligence, interpersonal skills, and creativity. - Honors & Awards: Notable achievements include the Golden Globe Award for Best Television Series and the Golden Globe Award for Best Television Special. 2. 
Steve Jobs: - Professional Experience: Co-founder of Apple Inc., where he led the development of iconic products like the iMac, iPod, and iPhone. - Skills: Highlighted as a thinker with a focus on innovation and problem-solving. - Honors & Awards: Recognized with numerous accolades, including the MacArthur Fellowship and the NeXT Computer Science Award. Please note that these descriptions are based on the information provided in the images and do not include any personal details or specific accomplishments mentioned in the images.\n\nUser Which of them better suits the role of the Congressman of the US? Why?\n\nOurs The first resume, featuring Oprah Winfrey, would better suit the role of a Congressman because it highlights skills and experiences that are relevant to political office, such as leadership, public speaking, and media experience. These qualities are essential for a politician to effectively represent their constituents and engage in legislative processes. The second resume, while it could be suitable for a creative or entrepreneurial role, does not specifically mention skills or experiences that align with the responsibilities of a Congressman.\n\nTable 12. An example of emerging capability: multi-doc VQA.\n\n## References\n\n- [1] Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katie Millican, Malcolm Reynolds, et al. Flamingo: a visual language model for few-shot learning. *arXiv preprint*, 2022.\n- [2] Anas Awadalla, Irena Gao, Josh Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Shiori Sagawa, et al. Openflamingo: An open-source framework for training large autoregressive vision-language models. 
*arXiv preprint arXiv:2308.01390*, 2023.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhen was Barack Obama born?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBarack Obama was born on August 4, 1961.", + "rag_summary": "barack obama was born on august 4, 1961.\n\nif the context contains the answer or any useful information, respond with that information. \nif no useful informations are, answer: no useful informations\nanswer: \n---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nthe provided context does not contain any information about barack obama's birthdate.", + "web_summary": "no useful information.", + "short_answer": "barack obama was born on august 4, 1961.", + "detailed_answer": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nshort answer: barack obama was born on august 4, 1961.\n\ndetailed answer: barack obama was born on august 4, 1961, in honolulu, hawaii, to barack obama, sr., and stanley ann dunham.", + "sources": { + "https://www.w3resource.com/sql-exercises/sql-subqueries-exercises.php": [ + "SQL SUBQUERIES - Exercises, Practice, Solution - w3resource" + ], + "https://www.geeksforgeeks.org/sql/how-to-use-a-subquery-in-a-select-statement/": [ + "How to Use a Subquery in a SELECT Statement - GeeksforGeeks" + ], + "https://www.datacamp.com/tutorial/sql-subquery": [ + "SQL Subquery: A Comprehensive Guide - DataCamp" + ], + "https://www.kdnuggets.com/beginners-guide-subqueries-sql": [ + "Beginner's Guide to Subqueries in SQL - KDnuggets" + ], + "https://www.britannica.com/biography/Barack-Obama": [ + "Barack Obama | Biography, Parents, Education, Presidency, Books ..." 
+ ], + "https://kvia.com/politics/cnn-us-politics/2025/02/05/barack-obama-fast-facts-2/": [ + "Barack Obama Fast Facts - KVIA" + ], + "https://chroniclepaths.com/public-figures/biography-of-barack-obama": [ + "Biography of Barack Obama - chroniclepaths.com" + ], + "https://www.havefunwithhistory.com/barack-obama-timeline/": [ + "Barack Obama Timeline - Have Fun With History" + ], + "https://www.politifact.com/factchecks/2025/jan/13/threads-posts/no-this-isnt-proof-former-president-barack-obama-w/": [ + "Image shows former President Barack Obama was born in Kenya." + ], + "https://ohmyfacts.com/celebrities/45-facts-about-barack-obama/": [ + "45 Facts About Barack Obama - OhMyFacts" + ], + "https://www.geeksforgeeks.org/nested-queries-in-sql/": [ + "Nested Queries in SQL - GeeksforGeeks" + ], + "https://learn.microsoft.com/en-us/sql/relational-databases/performance/subqueries?view=sql-server-ver17": [ + "Subqueries (SQL Server) - SQL Server | Microsoft Learn" + ], + "https://factcheckhub.com/false-claim-about-obamas-birthplace-circulates-online/": [ + "False claim about Obama's birthplace circulates online" + ], + "https://www.havefunwithhistory.com/facts-about-barack-obama/": [ + "10 Facts About Barack Obama - Have Fun With History" + ], + "https://slcl.libguides.com/c.php?g=1426394&p=10653829": [ + "The U.S. 
Government for Kids and Teens: Barack Obama" + ], + "https://en.wikipedia.org/wiki/Early_life_and_career_of_Barack_Obama": [ + "Early life and career of Barack Obama - Wikipedia" + ], + "https://www.snopes.com/fact-check/birthing-pains/": [ + "Kenyan Government Releases Obama's Real Birth Certificate" + ], + "https://www.obamalibrary.gov/obamas/president-barack-obama": [ + "President Barack Obama | Barack Obama Presidential Library" + ] + } + }, + { + "query": "Who founded Google?", + "rag_informations": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUse the following context to answer the questions.\n\nContext:\n[1] - Skills: Emphasized on emotional intelligence, interpersonal skills, and creativity. - Honors & Awards: Notable achievements include the Golden Globe Award for Best Television Series and the Golden Globe Award for Best Television Special. 2. Steve Jobs: - Professional Experience: Co-founder of Apple Inc., where he led the development of iconic products like the iMac, iPod, and iPhone. - Skills: Highlighted as a thinker with a focus on innovation and problem-solving. - Honors & Awards: Recognized with numerous accolades, including the MacArthur Fellowship and the NeXT Computer Science Award. Please note that these descriptions are based on the information provided in the images and do not include any personal details or specific accomplishments mentioned in the images.\n\nUser Which of them better suits the role of the Congressman of the US? Why?\n\nOurs The first resume, featuring Oprah Winfrey, would better suit the role of a Congressman because it highlights skills and experiences that are relevant to political office, such as leadership, public speaking, and media experience. These qualities are essential for a politician to effectively represent their constituents and engage in legislative processes. 
The second resume, while it could be suitable for a creative or entrepreneurial role, does not specifically mention skills or experiences that align with the responsibilities of a Congressman.\n\nTable 12. An example of emerging capability: multi-doc VQA.\n\n## References\n\n- [1] Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katie Millican, Malcolm Reynolds, et al. Flamingo: a visual language model for few-shot learning. *arXiv preprint*, 2022.\n- [2] Anas Awadalla, Irena Gao, Josh Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Shiori Sagawa, et al. Openflamingo: An open-source framework for training large autoregressive vision-language models. *arXiv preprint arXiv:2308.01390*, 2023.\n\n\n[2] | Jinhua,
[44]
al.
et
Zhang | China | days
1–16 | 14 | 14 | (35.7%)
5 | | | 14 |\n| Shanghai,
[45]
al.\n\n[3] | Jinhua,
[44]
al.
et
Zhang | China | days
1–16 | 14 | 14 | (35.7%)
5 | | | 14 |\n| Shanghai,
[45]
al.\n\n[4] 4%)
39 | (23.3%)
17 | | 73 |\n| [44]
al.
et
Zhang | China
Jinhua, | days
1–16 | 14 | 14 | (35.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho founded Google?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGoogle was founded by Larry Page and Sergey Brin in 1998.", + "rag_summary": "google was founded by larry page and sergey brin in 1998.\n\nif the context contains the answer or any useful information, respond with that information. \nif no useful informations are, answer: no useful informations\nanswer: \n---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ngoogle was founded by larry page and sergey brin in 1998.", + "web_summary": "no useful information is provided in the context to answer the question about who founded google.", + "short_answer": "google was founded by larry page and sergey brin in 1998.", + "detailed_answer": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nshort answer: google was founded by larry page and sergey brin in 1998.\n\ndetailed answer: google was founded on september 4, 1998, by american computer scientists larry page and sergey brin. since then, google has become the go-to search engine for internet users across the globe, from students researching for their term papers to businesses looking to attract more customers. the google search algorithm works by considering a number of factors and signals, such as the search query, content relevance, page useability, user location and source trustworthiness. 
the factors are then analyzed by the google ranking algorithm formula, which produces the rankings of pages in the google search results.", + "sources": { + "https://www.datacamp.com/tutorial/sql-subquery": [ + "SQL Subquery: A Comprehensive Guide - DataCamp" + ], + "https://www.geeksforgeeks.org/sql/sql-subquery/": [ + "SQL | Subquery - GeeksforGeeks" + ], + "https://www.w3resource.com/sql-exercises/sql-subqueries-exercises.php": [ + "SQL SUBQUERIES - Exercises, Practice, Solution - w3resource" + ], + "https://dev.to/abhay_yt_52a8e72b213be229/understanding-sql-subqueries-a-complete-guide-with-examples-3jff": [ + "Understanding SQL Subqueries: A Complete Guide with Examples" + ], + "https://en.wikipedia.org/wiki/Larry_Page": [ + "Larry Page - Wikipedia" + ], + "https://businesssharksmagazine.com/larry-page-and-sergey-brins-google-success/": [ + "Larry Page and Sergey Brin's Google Success Story" + ], + "https://press.farm/the-story-behind-google-search-engine/": [ + "The Story Behind Google - Pressfarm" + ], + "https://quartr.com/insights/business-philosophy/larry-page-the-mastermind-behind-google": [ + "Larry Page: The Mastermind Behind Google - Quartr" + ], + "https://www.britannica.com/money/Google-Inc": [ + "Google | History & Facts; Products & Services | Britannica Money" + ], + "https://www.jagranjosh.com/us/explainers/who-is-the-owner-of-google-1860000139": [ + "Who is the Owner of Google? - Jagran Josh" + ], + "https://interestingengineering.com/innovation/who-invented-google-and-how-did-it-happen": [ + "Who Invented Google and How did it Happen? - Interesting Engineering" + ], + "https://www.link-assistant.com/news/how-old-is-google.html": [ + "How Old Is Google? 
The Story Behind the Search Engine" + ], + "https://www.geeksforgeeks.org/sql/how-to-use-a-subquery-in-a-select-statement/": [ + "How to Use a Subquery in a SELECT Statement - GeeksforGeeks" + ], + "https://www.kdnuggets.com/beginners-guide-subqueries-sql": [ + "Beginner's Guide to Subqueries in SQL - KDnuggets" + ], + "https://datascienceexplain.com/sql-databases/mastering-sql-subqueries-a-complete-guide-to-writing-efficient-nested-queries/": [ + "Mastering SQL Subqueries: A Complete Guide to Writing Efficient Nested ..." + ], + "https://www.semrush.com/blog/google-search-algorithm/": [ + "How the Google Search Algorithm Works: A Zero-Fluff Guide - Semrush" + ], + "https://www.theedgesearch.com/2024/10/google-algorithms.html": [ + "Google algorithms explained: Everything you need to know" + ], + "https://www.searchenginejournal.com/google-algorithm-history/": [ + "Google Algorithm Updates & Changes: A Complete History" + ], + "https://www.searchlogistics.com/glossary/google-algorithm/": [ + "What Is The Google Algorithm? 
- SearchLogistics" + ] + } + }, + { + "query": "Where is the Eiffel Tower located?", + "rag_informations": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUse the following context to answer the questions.\n\nContext:\n[1] |---------------------|----------------------|------|------------------------|-------------------------|------------------|---------|--|\n| In-the-front | Interleaved | 52.9 | 36.8 | 30.5 | 70.1 | 74.0 | |\n| | In-the-front | 54.3 | 36.6 | 32.8 | 74.7 | 75.3 | |\n| Interleaved | Interleaved | 55.4 | 37.8 | 32.9 | 76.2 | 76.0 | |\n| | In-the-front | 52.4 | 36.1 | 29.0 | 72.9 | 71.8 | |\n| Mixed | Interleaved | 57.0 | 38.\n\n[2] |---------------------|----------------------|------|------------------------|-------------------------|------------------|---------|--|\n| In-the-front | Interleaved | 52.9 | 36.8 | 30.5 | 70.1 | 74.0 | |\n| | In-the-front | 54.3 | 36.6 | 32.8 | 74.7 | 75.3 | |\n| Interleaved | Interleaved | 55.4 | 37.8 | 32.9 | 76.2 | 76.0 | |\n| | In-the-front | 52.4 | 36.1 | 29.0 | 72.9 | 71.8 | |\n| Mixed | Interleaved | 57.0 | 38.\n\n[3] | Jinhua,
[44]
al.
et
Zhang | China | days
1–16 | 14 | 14 | (35.7%)
5 | | | 14 |\n| Shanghai,
[45]
al.\n\n[4] | Jinhua,
[44]
al.
et
Zhang | China | days
1–16 | 14 | 14 | (35.7%)
5 | | | 14 |\n| Shanghai,
[45]
al.\n\n[5] 7%)
5 | | | 14 |\n| [45]
al.
et
Ling | China
Shanghai, | days
22 | 66 | 66 | 66 | (88.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhere is the Eiffel Tower located?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe Eiffel Tower is located in Paris, France.", + "rag_summary": "the eiffel tower is located in paris, france.\n\nif the context contains the answer or any useful information, respond with that information. \nif no useful informations are, answer: no useful informations\nanswer: \n---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nno useful information", + "web_summary": "no useful information.", + "short_answer": "the eiffel tower is located in paris, france.", + "detailed_answer": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nshort answer: the eiffel tower is located in paris, france.\ndetailed answer: the eiffel tower is a wrought-iron structure in paris, france that is one of the most famous landmarks in the world. it is also a technological masterpiece in building-construction history. it was designed and built (1887-89) by gustave eiffel and named in his honor. the tower drew almost two million visitors during the world's fair, proving its immediate popularity despite earlier opposition. while initially intended as a temporary structure, to be dismantled after 20 years, eiffel had the foresight to propose scientific uses for the tower, which ensured its survival. 
today, it remains one of the tallest and most recognizable structures in the world, drawing millions of visitors annually, with viewing platforms offering stunning panoramic views of paris.", + "sources": { + "https://learnsql.com/blog/sql-subquery-practice/": [ + "SQL Subquery Practice: 15 Exercises with Solutions" + ], + "https://www.dataquest.io/blog/sql-subqueries-for-beginners/": [ + "SQL Subqueries: A Beginner's Guide (with Code Examples) - Dataquest" + ], + "https://w3schools.tech/tutorial/sql/sql-sub-queries": [ + "Sub Queries - Advanced SQL - W3schools" + ], + "https://www.geeksforgeeks.org/sql/sql-subquery/": [ + "SQL | Subquery - GeeksforGeeks" + ], + "https://www.britannica.com/topic/Eiffel-Tower-Paris-France": [ + "Eiffel Tower | History, Height, & Facts | Britannica" + ], + "https://heightandsize.com/eiffel-tower-height/": [ + "How Tall is the Eiffel Tower? - heightandsize.com" + ], + "https://www.architecturaldigest.com/story/eiffel-tower-everything-you-need-to-know": [ + "Eiffel Tower: Everything You Need to Know - Architectural Digest" + ], + "https://gosta.media/en/society/tourism/what-height-eiffel-tower-15-interesting-facts-you-may-not-know/": [ + "15 Interesting Facts About The Eiffel Tower That You May Not Know" + ], + "https://travelpander.com/does-the-eiffel-tower-have-a-purpose/": [ + "Eiffel Tower: Purpose, History, Facts, and Practical Uses Explained" + ], + "https://www.travelandleisure.com/attractions/landmarks-monuments/eiffel-tower-facts": [ + "16 Eiffel Tower Facts: History, Science, and Secrets - Travel" + ], + "https://1worldplace.com/what-is-the-purpose-of-the-eiffel-tower/": [ + "What is the Purpose of the Eiffel Tower: Discover remarkable" + ], + "https://www.w3resource.com/sql-exercises/sql-subqueries-exercises.php": [ + "SQL SUBQUERIES - Exercises, Practice, Solution - w3resource" + ], + "https://www.dannydutch.com/post/the-eiffel-tower-from-controversy-to-icon-the-opening-that-changed-paris-forever": [ + "The Eiffel Tower: 
From Controversy to Icon - The Opening That Changed ..." + ], + "https://worldofhistorycheatsheet.com/history-of-the-eiffel-tower/": [ + "History of the Eiffel Tower | World of History" + ], + "https://worldhistoryjournal.com/2025/01/11/the-eiffel-tower-engineering-marvel-and-cultural-landmark/": [ + "The History of the Eiffel Tower: an Engineering and Cultural Landmark" + ] + } + }, + { + "query": "When will the artificial intelligence conference be held?", + "rag_informations": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUse the following context to answer the questions.\n\nContext:\n[1] Q-bench: A benchmark for general-purpose foundation models on low-level vision. *arXiv preprint arXiv:2309.14181*, 2023.\n- [57] Junbin Xiao, Xindi Shang, Angela Yao, and Tat-Seng Chua. Next-qa: Next phase of question-answering to explaining temporal actions. In *Proceedings of the IEEE/CVF conference on computer vision and pattern recognition*, pages 9777–9786, 2021.\n- [58] Runsen Xu, Xiaolong Wang, Tai Wang, Yilun Chen, Jiangmiao Pang, and Dahua Lin. Pointllm: Empowering large language models to understand point clouds. *arXiv preprint arXiv:2308.16911*, 2023.\n- [59] Zhou Yu, Dejing Xu, Jun Yu, Ting Yu, Zhou Zhao, Yueting Zhuang, and Dacheng Tao. Activitynet-qa: A dataset for understanding complex web videos via question answering. In *Proceedings of the AAAI Conference on Artificial Intelligence*, volume 33, pages 9127–9134, 2019.\n- [60] Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu Jiang, Weiming Ren, Yuxuan Sun, et al. Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. 
In *Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition*, pages 9556–9567, 2024.\n- [61] Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu\n\n\n\n[2] Applied Data Analysis (CS401)\n\nLecture 1\nIntro to ADA\n11 Sep 2024\nMaria Brbić / Robert West\n‹#›\nImportant websites\n\nhttp://ada.epfl.ch\nYour main entry point. All materials linked from there.\nhttps://go.epfl.ch/ada2024-ed\nMain communication channel. Sign in with your EPFL email address (or simply access via Moodle).\n\nhttps://github.com/epfl-ada/2024\nUsed for exercises, homework, project, and final exam.\n‹#›\nPrevious instructor\nRobert West\nOn sabbatical in Microsoft Research, Seattle\n\n‹#›\nAbout your instructor this year\n~1500 people\n~ 3.850.000 people\n\n\n\nBorn in Tučepi, Croatia\nEducation:\nUniversity of Zagreb, Croatia\nUniversity of Tokyo\nStanford University, USA\nAssistant Professor at EPFL since Sep '22\nMachine Learning for Biomedicine (MLBio) lab\n‹#›\nOur research @ MLBio\nMethods\nMy Research\nBiomedicine\nMLBio\nLab\nMachine\nLearning\nChallenges\n‹#›\nOur research @ MLBio\nDevelop new AI methods\nUnsupervised learning, generative models, open-world learning etc\nCollaborate with biologists and medical researchers and have new \"untouched datasets\" or collect datasets\nGain new insights from these datasets 🡪 what are interesting questions that need new AI algorithms to be answered?\nApply AI algorithms we develop to advance biomedical research and drive new discoveries in biology and medicine\n‹#›\nOur research @ MLBio\n‹#›\nData analysis\n\n\"... the process of inspecting, cleaning, transforming, and modeling data with the goal of discovering useful information, suggesting conclusions, and supporting decision-making.\"\n\n\n[3] Applied Data Analysis (CS401)\n\nLecture 1\nIntro to ADA\n11 Sep 2024\nMaria Brbić / Robert West\n‹#›\nImportant websites\n\nhttp://ada.epfl.ch\nYour main entry point. 
All materials linked from there.\nhttps://go.epfl.ch/ada2024-ed\nMain communication channel. Sign in with your EPFL email address (or simply access via Moodle).\n\nhttps://github.com/epfl-ada/2024\nUsed for exercises, homework, project, and final exam.\n‹#›\nPrevious instructor\nRobert West\nOn sabbatical in Microsoft Research, Seattle\n\n‹#›\nAbout your instructor this year\n~1500 people\n~ 3.850.000 people\n\n\n\nBorn in Tučepi, Croatia\nEducation:\nUniversity of Zagreb, Croatia\nUniversity of Tokyo\nStanford University, USA\nAssistant Professor at EPFL since Sep '22\nMachine Learning for Biomedicine (MLBio) lab\n‹#›\nOur research @ MLBio\nMethods\nMy Research\nBiomedicine\nMLBio\nLab\nMachine\nLearning\nChallenges\n‹#›\nOur research @ MLBio\nDevelop new AI methods\nUnsupervised learning, generative models, open-world learning etc\nCollaborate with biologists and medical researchers and have new \"untouched datasets\" or collect datasets\nGain new insights from these datasets 🡪 what are interesting questions that need new AI algorithms to be answered?\nApply AI algorithms we develop to advance biomedical research and drive new discoveries in biology and medicine\n‹#›\nOur research @ MLBio\n‹#›\nData analysis\n\n\"... 
the process of inspecting, cleaning, transforming, and modeling data with the goal of discovering useful information, suggesting conclusions, and supporting decision-making.\"\n\"Data analysis has multiple facets and approaches, encompassing diverse techniques under a variety of names, in different business, science, and social science domains.\"\n‹#›\nApplied data analysis\nThis course is about breadth, not depth\n\"What methods, principles, and tools are out there?\", rather than \"How can I become an expert in deep learning for computer vision applied to images of cats?\"\nData science is a fast-paced, shifting field\nObsessing on one tool or technique won't pay off in a few years\nBe ready to explore and keep learning on your own\nComplementary courses:\nMachine learning\nNLP\nDIS\nData viz\nGoal of this class: Enable you to conduct a\nfull-fledged data science project from start to finish\nThat being said, depth matters too…\n‹#›\nLet's abbreviate this course as Ada, not A-D-A, in honor of Ada Lovelace, \"the world's first computer programmer.\"\nhttps://en.wikipedia.org/wiki/Ada_Lovelace\n\n‹#›\n\n‹#›\nSyllabus\nHandling data\n\"Slicing and dicing\": obtaining, preparing, juggling data\nVisualizing data\nExploration of data, communication of results\nDescribing data\nHow to support (and be suspicious of) claims about data\nRegression analysis for disentangling data\nHow to disentangle datasets with correlated variables\nCausal analysis of observational data\nHow to deal with \"found data\"\nCorrelation != causation\n‹#›\nSyllabus (cont'd)\nLearning from data\nSupervised learning\nUnsupervised learning\nApplied aspects of machine learning\nHandling specific types of data\nHandling text data\nHandling network data\nScaling to massive data\n‹#›\nGrading\n35% Homework assignments (2)\nInvolving skills required from data scientists\nGroups of 5 students\nHomework of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n35% Final exam\nMini data analysis project\nDone on 
laptop, individually, on campus\nFinal exams of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n30% Project (more details soon)\nYour own freestyle data analysis\nDone in groups of 5 students (same as for homework)\nMilestones spread throughout the semester\nProjects of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n‹#›\nGrading\n35% Homework assignments (2)\nInvolving skills required from data scientists\nGroups of 5 students\nHomework of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n35% Final exam (date TBD)\nMini data analysis project\nDone on laptop, individually, on campus\nFinal exams of 2017, 2018, 2019, 2020, 2021, 2022, 2023\n30% Project (more details soon)\nYour own freestyle data analysis\nDone in groups of 5 students (same as for homework)\nMilestones spread throughout the semester\nProjects of 2017, 2018, 2019, 2020, 2021, 2022, 2023\nThis class will be hard work,\nbut it will get you a job.\n‹#›\nGrading (cont'd)\nTo obtain a meaningful grade distribution, scaling/shifting will be applied to each of {homework, project, exam, quizzes} before taking weighted average (standard practice at EPFL)\nWhile intermediate grades are a good indication of where you stand, remember there might be some wiggle\n→ Don't rely on intermediate grades to decide whether you can afford to skip the exam etc.\n‹#›\nDeadlines\nHomeworks\nHomework 1\nRelease Oct 4th 2024\nDue Oct 18th 2024\nHomework 2\nRelease Nov 15th 2024\nDue Nov 29th 2024\nFinal exam\nDate TBD\nProject deliverables\nProject milestone P1\nDue Oct 4th 2024\nProject milestone P2\nDue Nov 15th 2024\nProject milestone P3\nDue Dec 20th 2024\nAll deadlines are 23:59 CET\n‹#›\nMeeting logistics: Lectures\nWednesdays 8:15–10:00\nIf you want to see it live, come to class! 
(No live streaming)\nLectures are also recorded and made available after class\n‹#›\nMeeting logistics: Lab sessions\nFridays 13:15–14:45\nIn person only:\nCO 1\nCM 1 120\nCM 1 121\nLabs are complementary to lectures, not simply more detail on same\nYou solve exercises that we make available the day before, can ask questions and get help from assistants\nIn certain weeks: homework/project office hours (probably on Zoom, in parallel to exercises)\n‹#›\nWeekly quizzes\nAvailable online on Moodle after every lecture\n5 questions, to be answered within 10 minutes of starting\nQuiz 1: no real questions, just to let you get familiar with the setup\nQuiz 2: the first quiz with lecture material questions\nQuiz i is about lecture material of week i\nGoal:\nEngage continuously with course material\nThink (not just find right slide)\nNot graded, for you to recap lecture materials\n‹#›\nProject\nWe'll provide a number of datasets\nYou need to form and pitch a crisp project idea\nFree to combine with other datasets (at your own risk)\nGoal: not a loose collection of results – tell a story with the data!\nData stories of 2017, 2018, 2019, 2020, 2021, 2022, 2023\nNice example data story\n‹#›\n\nHomework and projects: GitHub\nDe-facto standard for managing and sharing code\nAll students in this class need a GitHub account\nHomework and project submissions done via GitHub\nADA Github repository:\nhttps://github.com/epfl-ada/2024\n‹#›\nMain communication channel:\nClass forum, available via Moodle\nAlso accessible directly, outside of Moodle:https://go.epfl.ch/ada2024-ed (sign in using the same email address as for Moodle)\nCentral place to ask all class-related questions\nDon't send us email\nMandatory! 
We'll send important announcements on Ed only\nHelp each other (without cheating, of course)\n‹#›\nWatch-at-home videos\nThroughout the semester, we'll release videos with supplemental information; e.g.,\nIntro to lab sessions (already available!)\nProject instructions\nHomework 1 postmortem\nHomework 2 postmortem\n‹#›\nGeneral note on communication\nMultiple platforms used in ADA for various tasks (as in real life): Ed, GitHub, Google docs, ADA website\nTo avoid confusion,\nfamiliarize yourself with communication guidelines\nall materials will be linked from the website as a central point of entry: https://ada.epfl.ch\nall discussions will take place on Ed\n‹#›\nCommercial break\nADA students: sharp like teeth!\n‹#›\nGroup registration\nMust form teams within 2 weeks, starting now (in time for release of Homework 1)!\nGet started immediately to find 4 other teammates\nBy Fri 27 Sep 23:59, complete the registration form (to be done by each team member individually):https://go.epfl.ch/ada2024-team-registration\nCan change team after Homework 1 (but try to avoid it)\n‹#›\nPrerequisites\nBasics of\nprobabilities and stats\ndatabases\nprogramming\nYou won't survive if you can't program\nHomework, exam: Python required\nProject: up to you, but we support only Python\nBrush up your Python skills (many great online courses out there)\n‹#›\nPython environments\nHomeworks and exams to be done as Jupyter Notebooks\nYou will submit a pre-executed .ipynb file\nWe don't care how you produce it\nOption 1: local Python installation (e.g., Anaconda + JupyterLab)\nOption 2: Google Colab = notebook hosted by Google\nOption 3: noto = notebook hosted by EPFL\nTo get started: come to Friday's lab session (\"Exercise 0\")\n\"Homework 0\": do it yourself at home after lab session (optional, not graded)\nDoing Homework 0 is the best way of making sure you're set up correctly for later homework, project, exam\n‹#›\nPython++\n‹#›\nPOLLING TIME\n\"What is your prior experience with 
Python?\"\nScan QR code or go tohttps://app.sli.do/event/iPMVwwrojFn9EaNp6iiDDx\n\n\n‹#›\nInstructor\nTAs: Teaching assistants (PhD students)\nHead TAs\n\n\n\n\nTimDavidson\nSepideh\nMamooler\nAoxiangFan\nMaria Brbić\nMarija Šakota\nSaibo Geng\n\n\n\n\nIvan\nZakazov\nBettina\nMessmer\nShuqi\nWang\nShawn Fan\nShuangqi\nLi\nYist\nYu\nYulun\nJiang\nArtyomGadetsky\nSAs: Student assistants (Master students)\n\n\n\n\n\n\n\n\n\n\n\nStefan\nKrsteski\nXi\nLei\nMatea\nTashkovska\nElisa\nBillard\nPablo\nMenéndez\nJiaming Jiang\nJakhongir Saydaliev\nAllocio\nJeanne\nYannis Laaroussi\nSebastien\nChahoud\nYagiz\nGençer\nTianhao\nDai\nTugba\nTümer\n\nHelp each other on Ed\nParticipate actively in classes and labs\nGive us feedback\n‹#›\nFeedback\nGive us feedback on this lecture here: https://go.epfl.ch/ada2024-lec1-feedback\nFeedback form available for each lecture and lab session\nWhat did you (not) like about this lecture?\nWhat was (not) well explained?\nOn what would you like more (fewer) details?\n…\n‹#›\nQuestions?\n\n‹#›\nWhat is data science?\n‹#›\nWhy now?\nI WISH I HAD BECOME A DATA SCIENTIST BACK IN 2023…\n‹#›\n\"Data science\"\nMost science is (or should be) based on data, per definitionem\nSo how is \"data science\" different from plain old \"science\"?\n‹#›\nData volume explodes\n\"Between the dawn of civilization and 2003, we only created five exabytes of information; now [in 2010] we're creating that amount every two days.\"\nEric Schmidt, Google (2010)\n‹#›\nData variety explodes\nText (indexed Web pages, email), networks (Web graph, knowledge graph), images, maps, logs (search logs, server logs, GPS logs), speech, …\n\n‹#›\nNeeded: A method to the madness\n\nScientific method 1.0:- Focused on \"Model the data\"- Scientist has hypothesis prior to analyzing the data\nScientific method 2.0:\n- Data-driven science - Systematic cycle (see diagram) - \"Explore the data\" becomes increasingly important\nData as a first-class citizen\n‹#›\nScientist 2.0\n‹#›\n\"A data 
scientist is someone who can obtain, scrub, explore, model, and interpret data, blending hacking, statistics, and machine learning. Data scientists not only are adept at working with data, but appreciate data itself as a first-class product.\"\nHilary Mason, chief scientist at bit.ly\n‹#›\nJosh Wills, Data Scientist at Slack\n‹#›\n‹#›\n‹#›\nMore data often beats better algorithms\n\n\nhttp://www.incompleteideas.net/IncIdeas/BitterLesson.html\n‹#›\n21st-century politics\n\n\n\n\nWe ask: Do these subjective impressions reflect the true state of US political discourse?\n\nADA will teach you the tools to answer such questions using data (see next slides)\nhttps://www.pewresearch.org/politics/wp-content/uploads/sites/4/2019/06/PP_2019.06.19_Political-Discourse_FINAL.pdf\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\nData: https://github.com/epfl-dlab/Quotebank\nWeb interface: https://quotebank.dlab.tools/\n‹#›\nSyllabus, revisited\n\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n‹#›\nSyllabus, revisited\n\"Is the effect real, or could it have been produced by chance?\"\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n\n\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing 
data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n\"What caused the observed increase in negativity?\"\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\nQuotes were attributed to speakers by a machine learning algorithm\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\nResearch question (\"Did political discourse become more negative?\") is a question about language == text\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\n\"Who speaks about whom in what way?\" → Construct \"who-mentions-whom\" network\n‹#›\nSyllabus, revisited\nHandling data\nVisualizing data\nDescribing data\nRegression analysis for disentangling data\nCausal analysis of observational data\nLearning from data\nHandling text data\nHandling network data\nScaling to massive data\nExtracting 235 million quotes from 127 million news articles (about 1 terabyte of raw text) spanning 12 years requires big-data tools (e.g., Spark)\n‹#›\nCurious to learn more?\nFull paper available at https://www.nature.com/articles/s41598-023-36839-1\n\n‹#›\nTODO before Friday's lab session\nSign up for Ed here and familiarize yourself with it\nIf you're not on GitHub yet, sign up for GitHub\nStart looking for 4 teammates\nYou may use \"Group formation\" category on Ed\nCheck out Google Colab and noto (to see if 
you want to use either of them)\nCheck out Exercise 0 here (in prep for Fri lab session)\n‹#›\nAny feedback? -- Let us know!\nGive us feedback on this lecture here: https://go.epfl.ch/ada2024-lec1-feedback\nFeedback form available for each lecture and lab session\nWhat did you (not) like about this lecture?\nWhat was (not) well explained?\nOn what would you like more details?\nWhat would you like the instructor to wear next time?\n…\n‹#›<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhen will the artificial intelligence conference be held?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI don't have access to specific event schedules. However, you can check the conference website or contact the organizers for more information about the conference schedule.", + "rag_summary": "i don't have access to specific event schedules. however, you can check the conference website or contact the organizers for more information about the conference schedule.\n\nif the context contains the answer or any useful information, respond with that information. 
\nif no useful informations are, answer: no useful informations\nanswer: \n---------------------------<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nno useful information", + "web_summary": "no useful information is provided in the context to determine the date of the artificial intelligence conference.", + "short_answer": "no useful information", + "detailed_answer": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nshort answer: no useful information\ndetailed answer: no useful information is provided to determine the date of the artificial intelligence conference.", + "sources": { + "https://www.w3resource.com/sql-exercises/sql-subqueries-exercises.php": [ + "SQL SUBQUERIES - Exercises, Practice, Solution - w3resource" + ], + "https://www.geeksforgeeks.org/sql/how-to-use-a-subquery-in-a-select-statement/": [ + "How to Use a Subquery in a SELECT Statement - GeeksforGeeks" + ], + "https://www.datacamp.com/tutorial/sql-subquery": [ + "SQL Subquery: A Comprehensive Guide - DataCamp" + ], + "https://www.kdnuggets.com/beginners-guide-subqueries-sql": [ + "Beginner's Guide to Subqueries in SQL - KDnuggets" + ], + "https://aaai.org/conference/aaai/aaai-25/": [ + "The 39th Annual AAAI Conference on Artificial Intelligence" + ], + "https://engine.com/business-travel-guide/biggest-ai-expos-events-conferences-2025": [ + "The Biggest AI Conferences & Events Happening in 2025" + ], + "https://www.techtarget.com/whatis/feature/Top-AI-conferences-and-virtual-events": [ + "12 AI conferences to attend in 2025 - TechTarget" + ], + "https://www.analyticsinsight.net/artificial-intelligence/top-ai-conferences-to-watch-out-for-in-2025": [ + "Top AI Conferences to Watch Out for in 2025 - Analytics Insight" + ], + "https://datascienceexplain.com/sql-databases/mastering-sql-subqueries-a-complete-guide-to-writing-efficient-nested-queries/": [ + "Mastering SQL Subqueries: A Complete Guide to Writing Efficient Nested ..." 
+ ], + "https://aaai.org/conference/aaai/aaai-25/program-overview/": [ + "The 39th Annual AAAI Conference on Artificial Intelligence" + ], + "https://www.datacamp.com/blog/top-ai-conferences": [ + "Top 10 AI Conferences for 2025 - DataCamp" + ], + "https://www.nimblework.com/blog/ai-conferences-and-events-2025/": [ + "Top AI Conferences And Events Of 2025 - nimblework.com" + ] + } + } +] \ No newline at end of file diff --git a/src/mmore/cli.py b/src/mmore/cli.py index ec9a7f04..35550a2f 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -225,6 +225,24 @@ def index_api(config_file, host, port): run_api(config_file, host, port) +@main.command() +@click.option( + "--config-file", + type=str, + required=True, + help="Path to the Websearch configuration file (YAML).", +) +def websearch(config_file): + """Run the Websearch (+ optional RAG) pipeline.""" + from .run_websearch import run_websearch + + # # Load your YAML configuration and pass it into the runner + # with open(config_file, "r") as f: + # config_dict = yaml.safe_load(f) + + run_websearch(config_file) + + @main.command() @click.option( "--host", diff --git a/src/mmore/rag/llm.py b/src/mmore/rag/llm.py index 8b622c0b..3079777c 100644 --- a/src/mmore/rag/llm.py +++ b/src/mmore/rag/llm.py @@ -2,8 +2,9 @@ from dataclasses import dataclass # from getpass import getpass -from typing import Optional, cast +from typing import ClassVar, Optional, cast +import torch from langchain_anthropic import ChatAnthropic from langchain_cohere import ChatCohere from langchain_core.language_models.chat_models import BaseChatModel @@ -107,6 +108,11 @@ def api_key(self): class LLM(BaseChatModel): """Class parsing the model name and arguments to load the correct LangChain model""" + device_count: ClassVar[int] = 0 + nb_devices: ClassVar[int] = ( + torch.cuda.device_count() if torch.cuda.is_available() else 1 + ) + @staticmethod def _check_key(org): if f"{org}_API_KEY" not in os.environ: @@ -122,11 +128,14 @@ def from_config(cls, 
config: str | LLMConfig) -> BaseChatModel: config = load_config(config, LLMConfig) if config.organization == "HF": + cls.device_count = (cls.device_count + 1) % ( + cls.nb_devices + 1 + ) # rotate devices, +1 for accounting the -1 below return ChatHuggingFace( llm=HuggingFacePipeline.from_model_id( config.llm_name, task="text-generation", - device_map="auto", + device=cls.device_count - 1, pipeline_kwargs=config.generation_kwargs, ) ) diff --git a/src/mmore/run_rag.py b/src/mmore/run_rag.py index 40bbbae9..1c08f3e8 100644 --- a/src/mmore/run_rag.py +++ b/src/mmore/run_rag.py @@ -8,7 +8,7 @@ import uvicorn from dotenv import load_dotenv from fastapi import FastAPI -from langserve import add_routes +from pydantic import BaseModel from mmore.rag.pipeline import RAGConfig, RAGPipeline from mmore.utils import load_config @@ -62,15 +62,34 @@ def save_results(results: List[Dict], output_file: Union[Path, str]): json.dump(results, f, indent=2) +class InnerInput(BaseModel): + input: str + collection_name: Optional[str] = None + + +class RAGInput(BaseModel): + input: InnerInput + + +class RAGOutput(BaseModel): + input: Optional[str] = None + context: Optional[str] = None + answer: Optional[str] = None + + def create_api(rag: RAGPipeline, endpoint: str): app = FastAPI( title="RAG Pipeline API", description="API for question answering using RAG", - version="1.0", + version="2.0", ) - # Add routes for the RAG chain - add_routes(app, rag.rag_chain, path=endpoint, playground_type="chat") + @app.post(endpoint, response_model=RAGOutput) + async def run_rag(request: RAGInput): + # Extract the inner input dict to pass to rag_chain + pipeline_input = request.input.model_dump() + output_dict = rag.rag_chain.invoke(pipeline_input) + return RAGOutput(**output_dict) @app.get("/health") def health_check(): @@ -80,7 +99,7 @@ def health_check(): def rag(config_file): - """Run RAG.""" + """Run RAG in local or API""" config = load_config(config_file, RAGInferenceConfig) logger.info("Creating 
@dataclass
class WebsearchInferenceConfig:
    """Top-level configuration loaded by ``run_websearch``.

    Bundles the websearch pipeline settings with optional mode-specific
    arguments (local or API).
    """

    # Settings of the websearch pipeline itself (its ``mode`` is read below).
    websearch: WebsearchConfig
    # Mode-specific arguments; may be omitted in the configuration file.
    mode_args: Optional[Union[LocalConfig, APIConfig]] = None

    def __post_init__(self):
        """Install a default ``APIConfig`` when API mode has no ``mode_args``."""
        if self.websearch.mode != "api":
            return
        if self.mode_args is None:
            self.mode_args = APIConfig()
def create_api(config: WebsearchInferenceConfig):
    """Build the FastAPI application exposing the websearch pipeline.

    Args:
        config: Inference configuration; ``config.websearch`` is handed to
            ``WebsearchPipeline`` and ``config.websearch.rag_config_path`` is
            loaded when a request asks for RAG.

    Returns:
        The configured ``FastAPI`` app with a single ``POST /websearch`` route.
    """
    app = FastAPI(
        title="mmore Websearch API",
        description="""This API is based on the OpenAPI 3.1 specification. You can find out more about Swagger at [https://swagger.io](https://swagger.io).

## Overview

This API defines the retriever API of mmore, handling:

1. **File Operations** - Direct file management within mmore.
2. **Rag and websearch** - Search based on the query/documents.""",
        version="1.0.0",
    )

    logger.info("Websearch loaded!")

    @app.post("/websearch")
    def websearch(query: WebQuery):
        """Run (optional) RAG followed by the websearch loop for one request."""
        # NOTE(review): a pipeline (and therefore its LLM) is built on every
        # request; consider caching it if start-up cost matters.
        pipeline = WebsearchPipeline(config=config.websearch)

        if query.use_rag:
            logger.info("Launch RAG")
            config_rag = load_config(
                config.websearch.rag_config_path, RAGInferenceConfig
            )
            logger.info("Creating the RAG Pipeline...")
            rag_pp = RAGPipeline.from_config(config_rag.rag)
            # model_dump() is the pydantic v2 replacement for the deprecated
            # .dict(); run_rag.py already relies on it.
            data = rag_pp([query.query.model_dump()], return_dict=True)
            logger.info("RAG done")
            # The original call passed `data` without a placeholder, which
            # makes the logging module report a formatting error instead of
            # emitting the record.
            logger.info("##RAG## %s", data)
        else:
            # NOTE(review): this branch forwards the QueryInput model itself,
            # whereas the RAG branch forwards a list of dicts -- confirm that
            # run_api handles both shapes.
            data = query.query

        logger.info("Launch websearch")

        answers = pipeline.run_api(query.use_rag, query.use_summary, data)
        logger.info("Websearch done")

        return answers

    return app
def access_rag_config(self) -> Dict[str, Any]:
    """Load and parse the RAG configuration file at ``rag_config_path``.

    Returns:
        The parsed YAML document as a dictionary.

    Raises:
        ValueError: If ``rag_config_path`` is empty, or the file does not
            contain a YAML mapping (e.g. it is empty).
        FileNotFoundError: If no file exists at ``rag_config_path``.
    """
    if not self.rag_config_path:
        raise ValueError("The 'rag_config_path' is not defined.")

    # Resolve the path to the RAG config file
    rag_config_full_path = Path(self.rag_config_path)

    if not rag_config_full_path.exists():
        raise FileNotFoundError(
            f"RAG config file not found at {rag_config_full_path}"
        )

    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(rag_config_full_path, "r", encoding="utf-8") as file:
        rag_config = yaml.safe_load(file)

    # safe_load returns None for an empty file and may return a list/scalar;
    # callers use dict methods on the result, so fail early and clearly.
    if not isinstance(rag_config, dict):
        raise ValueError(
            f"RAG config file at {rag_config_full_path} must contain a YAML mapping."
        )

    return rag_config
BaseChatModel +from langchain_core.messages import HumanMessage, SystemMessage + +from ..rag.llm import LLM, LLMConfig +from ..run_rag import rag +from .config import WebsearchConfig +from .logging_config import logger + + +@dataclass +class ProcessedResponse: + query: str + rag_informations: str + rag_summary: str | None + web_summary: str + short_answer: str + detailed_answer: str + sources: Dict[str, Any] # Maps URLs to lists of titles + + +def extract_response(content: str | list[str | dict]) -> str: + response_content = content + if isinstance(response_content, str): + response = response_content + else: + response_tmp = response_content[-1] + response_tmp: str | dict[str, str] + + if isinstance(response_tmp, str): + response = response_tmp + else: + response = response_tmp.get("content", "") + + return response + + +class WebsearchPipeline: + """ + Pipeline for running RAG and iterative websearch loops, + integrating retrieved knowledge into enhanced answers. + """ + + def __init__(self, config: WebsearchConfig): + self.config = config + self.llm = self._initialize_llm() + self.rag_results = None + self.wrapper = DuckDuckGoSearchAPIWrapper(max_results=self.config.max_searches) + self.search = DuckDuckGoSearchResults( + api_wrapper=self.wrapper, output_format="list" + ) + + def _initialize_llm(self) -> BaseChatModel: + if self.config.use_rag: + rag_cfg = self.config.access_rag_config() + llm_conf: Dict[str, Any] = rag_cfg.get("rag", {}).get("llm") + if llm_conf is None: + raise ValueError( + "Missing 'llm' config under 'rag' in RAG configuration." 
def evaluate_subquery_relevance(
    self, query, current_subqueries, previous_subqueries
):
    """Ask the LLM whether the new subqueries are still worth searching.

    Args:
        query: The original user query.
        current_subqueries: Subqueries generated for the current loop.
        previous_subqueries: Subqueries used in the previous loop.

    Returns:
        ``False`` only when the first yes/no word in the LLM reply is "no";
        ``True`` otherwise (including when the reply has no verdict at all).
    """
    prompt = (
        f"Original query:\n{query}\n\n"
        f"Previous subqueries that contribute to understanding:\n{previous_subqueries}\n\n"
        f"New subqueries:\n{current_subqueries}\n\n"
        "Are any of the new subqueries relevant in the context of the original query and previous subqueries? "
        "Respond strictly with 'yes' if at least one is relevant, or 'no' if none are."
    )
    messages = [
        SystemMessage(content="You are a helpful assistant"),
        HumanMessage(content=prompt),
    ]
    response_llm = self.llm.invoke(messages)
    response_content = extract_response(response_llm.content)
    response = self._clean_llm_output(response_content)

    # A plain substring test ('"no" in response') also fires on words such as
    # "know" or "another" and misses a capitalised "No". Look for the first
    # stand-alone yes/no word instead, case-insensitively.
    verdict = re.search(r"\b(yes|no)\b", str(response), flags=re.IGNORECASE)
    return verdict is None or verdict.group(1).lower() == "yes"
def duckduckgo_search(self, query: str) -> List[Dict[str, str]]:
    """
    Run one DuckDuckGo search through the LangChain tool held in ``self.search``.

    Returns a list of dicts with keys 'snippet', 'url' and 'title' (missing
    values become empty strings). Any error is logged and yields an empty list.
    """
    try:
        raw_hits = self.search.invoke(query)
        return [
            {
                "snippet": hit.get("snippet", ""),
                # LangChain reports the address under "link", not "url"
                "url": hit.get("link", ""),
                "title": hit.get("title", ""),
            }
            for hit in raw_hits
        ]
    except Exception as e:
        logger.error(f"DuckDuckGo search error: {e}")
        return []
def process_record(self, rec: Dict[str, Any]) -> Dict[str, Any]:
    """
    Answer one query record by looping over web searches and LLM integration.

    Up to ``self.config.n_loops`` rounds are run. Each round generates
    subqueries, searches DuckDuckGo for each of them, summarises the snippets
    per subquery and per round, and asks the LLM for a short and a detailed
    answer. The detailed answer becomes the context of the next round. From
    the second round on, the loop stops early when
    ``evaluate_subquery_relevance`` returns a falsy value.

    Args:
        rec: Record holding the user query under ``"input"`` and, when
            ``self.config.use_rag`` is enabled, the RAG answer under
            ``"answer"``.

    Returns:
        The ``ProcessedResponse`` built for this record, converted to a
        plain dict with ``asdict``.
    """
    use_rag = self.config.use_rag
    qr = rec.get("input", "").strip()
    rag_ans = rec.get("answer", "") if use_rag else ""
    self.rag_results = rag_ans
    # Stays None when RAG is disabled.
    rag_summary = self.generate_summary(rag_ans, qr) if use_rag else None

    source_map: Dict[str, List[str]] = {}  # url -> titles seen for that url
    current_context = rag_summary
    final_short, final_detailed = "", ""
    web_summaries = []
    previous_sub = []

    for loop in range(self.config.n_loops):
        if use_rag:
            subs = self.generate_subqueries(qr, current_context)
        else:
            subs = self.generate_subqueries(qr)  # Based on original query only

        if loop > 0 and not self.evaluate_subquery_relevance(
            qr, subs, previous_sub
        ):
            break

        snippets = []
        subquery_summaries = []

        for sq in subs:
            # Fixed pause before every search.
            # NOTE(review): presumably to stay under DuckDuckGo rate limits; confirm.
            time.sleep(10)

            subquery_snippets = []
            for r in self.duckduckgo_search(query=sq):
                titles = source_map.setdefault(r["url"], [])
                if r["title"] not in titles:
                    titles.append(r["title"])

                # Keep the snippet text as is; the earlier f-string appended
                # a stray ")" to every snippet handed to the LLM.
                snippet = str(r["snippet"])
                snippets.append(snippet)
                subquery_snippets.append(snippet)

            summary = self.generate_summary("\n".join(subquery_snippets), sq)
            subquery_summaries.append(summary)

        previous_sub = subs

        combined_sub_summaries = "\n".join(
            str(s) if s else "" for s in subquery_summaries
        )
        web_summary = self.generate_summary(combined_sub_summaries, qr)
        web_summaries.append(web_summary)

        if self.config.use_summary:
            # Combine rag summary and web summary for the final answer
            context_for_llm = (
                f"RAG informations:\n{rag_summary or ''}\n\n"
                f"Web informations:\n{web_summary}"
            )
        else:
            # Otherwise fall back to the context carried over from the previous round
            context_for_llm = current_context

        # Current context plus raw web snippets produce this round's answer
        out = self.integrate_with_llm(qr, context_for_llm, snippets)
        final_short, final_detailed = out["short"], out["detailed"]

        # Prepare context for next search loop
        current_context = final_detailed

    # Only the summary over all completed rounds is reported, so build it once
    # after the loop instead of spending an LLM call on it in every round.
    web_summary_all = ""
    if web_summaries:
        combined_web_summaries = "\n".join(
            str(s) if s else "" for s in web_summaries
        )
        web_summary_all = self.generate_summary(combined_web_summaries, qr)

    solution = ProcessedResponse(
        query=qr,
        rag_informations=self.rag_results,
        rag_summary=rag_summary,
        web_summary=web_summary_all,
        short_answer=final_short,
        detailed_answer=final_detailed,
        sources=source_map,
    )

    return asdict(solution)
class WebsearchOnly:
    """Run DuckDuckGo web searches and summarise their output with an LLM."""

    def __init__(
        self,
        region: str = "wt-wt",
        max_results: int = 10,
        llm_name: str = "OpenMeditron/meditron3-8b",
        max_new_tokens: int = 1200,
    ):
        """
        Store the search and summarisation settings.

        Args:
            region: Region passed to ``DuckDuckGoSearchAPIWrapper``.
            max_results: Maximum number of results passed to the wrapper.
            llm_name: Model name used to build the summarisation LLM.
            max_new_tokens: Generation limit used to build the summarisation LLM.
        """
        self.wrapper = DuckDuckGoSearchAPIWrapper(
            region=region, max_results=max_results
        )
        self.llm_name = llm_name
        self.max_new_tokens = max_new_tokens
        # Built on first use so that search-only callers never create an LLM.
        self._llm = None

    def websearch_pipeline(self, query: str) -> str:
        """
        Perform a single web search and return the search tool's output.

        NOTE(review): annotated as ``str`` because ``summarize_web_search``
        consumes the output as a string; confirm against the tool's output
        format.
        """
        search = DuckDuckGoSearchResults(api_wrapper=self.wrapper)
        return search.run(query)

    def _get_llm(self):
        """Return the summarisation LLM, creating it on the first call only."""
        if self._llm is None:
            self._llm = LLM.from_config(
                LLMConfig(
                    llm_name=self.llm_name, max_new_tokens=self.max_new_tokens
                )
            )
        return self._llm

    def summarize_web_search(self, query: str, web_output: str) -> str:
        """
        Ask the LLM to answer ``query`` from ``web_output``.

        Args:
            query: The original user query.
            web_output: Web search output to base the answer on.

        Returns:
            The LLM response text with surrounding whitespace removed.
        """
        prompt = (
            f"Original Query: '{query}'\n"
            f"Web content: '{web_output}'\n"
            "Based on the original query and the web content, can you provide a response to the original query?"
        )
        response = self._get_llm().invoke(prompt).content
        # Narrows the content type before calling a str method on it.
        assert isinstance(response, str)
        return response.strip()