fix bug

2025-02-04 23:30:47 +08:00
parent c90acade45
commit 1acdc60b9e
4 changed files with 180 additions and 126 deletions
--- a/src/agent/custom_agent.py
+++ b/src/agent/custom_agent.py
@@ -270,7 +270,8 @@ class CustomAgent(Agent):
            self._last_result = result
            self._last_actions = actions
            if len(result) > 0 and result[-1].is_done:
-                self.extracted_content += step_info.memory
+                if not self.extracted_content:
+                    self.extracted_content = step_info.memory
                result[-1].extracted_content = self.extracted_content
                logger.info(f"📄 Result: {result[-1].extracted_content}")

@@ -346,7 +347,10 @@ class CustomAgent(Agent):
                    break
            else:
                logger.info("❌ Failed to complete task in maximum steps")
-                self.history.history[-1].result[-1].extracted_content = self.extracted_content
+                if not self.extracted_content:
+                    self.history.history[-1].result[-1].extracted_content = step_info.memory
+                else:
+                    self.history.history[-1].result[-1].extracted_content = self.extracted_content

            return self.history

--- a/src/controller/custom_controller.py
+++ b/src/controller/custom_controller.py
@@ -1,3 +1,5 @@
+import pdb
+
 import pyperclip
 from typing import Optional, Type
 from pydantic import BaseModel
@@ -21,10 +23,11 @@ import logging

 logger = logging.getLogger(__name__)

+
 class CustomController(Controller):
    def __init__(self, exclude_actions: list[str] = [],
-                output_model: Optional[Type[BaseModel]] = None
-                ):
+                 output_model: Optional[Type[BaseModel]] = None
+                 ):
        super().__init__(exclude_actions=exclude_actions, output_model=output_model)
        self._register_custom_actions()

@@ -44,7 +47,7 @@ class CustomController(Controller):
            await page.keyboard.type(text)

            return ActionResult(extracted_content=text)
-        
+
        @self.registry.action(
            'Extract page content to get the pure text or markdown with links if include_links is set to true',
            param_model=ExtractPageContentAction,
@@ -52,12 +55,17 @@ class CustomController(Controller):
        )
        async def extract_content(params: ExtractPageContentAction, browser: BrowserContext):
            page = await browser.get_current_page()
+            # Load the current URL through the Jina Reader proxy (https://r.jina.ai/<url>) before extracting content
+            url = page.url
+            jina_url = f"https://r.jina.ai/{url}"
+            await page.goto(jina_url)
            output_format = 'markdown' if params.include_links else 'text'
            content = MainContentExtractor.extract(  # type: ignore
                html=await page.content(),
                output_format=output_format,
            )
-            title = await page.title()
-            msg = f'📄  Page url: {page.url}, Page title: {title}, Extracted page content as {output_format}\n: {content}\n'
+            # Navigate back to the original URL
+            await page.go_back()
+            msg = f'📄  Extracted page content as {output_format}\n: {content}\n'
            logger.info(msg)
            return ActionResult(extracted_content=msg)
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -3,6 +3,7 @@ import os
 import time
 from pathlib import Path
 from typing import Dict, Optional
+import requests

 from langchain_anthropic import ChatAnthropic
 from langchain_mistralai import ChatMistralAI
--- a/tests/test_deep_research.py
+++ b/tests/test_deep_research.py
@@ -21,170 +21,214 @@ from json_repair import repair_json
 from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
 from src.controller.custom_controller import CustomController

-# define task
-task = "中文写一篇关于中美AI竞赛的论文, 分析二者会在哪些AI领域进行竞争和协作, 2000个字以上"
-task_id = uuid4().__str__()
-save_dir = os.path.join(f"./tmp/deep_research/{task_id}")
-os.makedirs(save_dir, exist_ok=True)

-# llm = utils.get_llm_model(provider="gemini", model_name="gemini-2.0-flash-thinking-exp-01-21", temperature=0.7)
-llm = utils.get_llm_model(provider="deepseek", model_name="deepseek-reasoner", temperature=0.7)
-llm_bu = utils.get_llm_model(provider="azure_openai", model_name="gpt-4o", temperature=0.7)
-# 搜索的信息
-search_infos = ""
-# 搜索的LLM历史信息
-max_query_num = 3
-search_system_prompt = f"""
-You are an expert task planner for an AI agent that uses a web browser with **automated execution capabilities**. Your goal is to analyze user instructions and, based on available information, 
-determine what further search queries are necessary to fulfill the user's request. You will output a JSON object with the following structure:
+async def deep_research():
+    # define task
+    task = "Write a report on RPA (Robotic Process Automation) technology in English, from all espects, more than 2,000 words"
+    task_id = uuid4().__str__()
+    save_dir = os.path.join(f"./tmp/deep_research/{task_id}")
+    os.makedirs(save_dir, exist_ok=True)

-[
-    "search query 1",
-    "search query 2",
-    //... up to a maximum of {max_query_num} search queries
-]
-```
+    llm = utils.get_llm_model(provider="gemini", model_name="gemini-2.0-flash-thinking-exp-01-21", temperature=0.7)
+    # llm = utils.get_llm_model(provider="deepseek", model_name="deepseek-reasoner", temperature=0.7)
+    llm_bu = utils.get_llm_model(provider="azure_openai", model_name="gpt-4o", temperature=0.7)

-Here's an example of the type of `search` tasks we are expecting:
-[
-    "weather in Tokyo",
-    "cheap flights to Paris"
-]
-```
+    # Message history for the search-planning LLM
+    max_query_num = 3
+    search_system_prompt = """
+    You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information.

-**Important:**
+    **Your Task:**

-*   Your output should *only* include search queries as strings in a JSON array. Do not include other task types like navigate, click, extract, etc.
-*   Limit your output to a **maximum of {max_query_num}** search queries.
-*   Make the search queries to help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results.
-*   If you have gathered for all the information you want and no further search queries are required, output an empty list: `[]`
-*   Make sure output search queries are different from the previous queries.
+    Given a user's research topic, you will:

-**Inputs:**
+    1. **Develop a Research Plan:** Outline the key aspects and subtopics that need to be investigated to thoroughly address the user's request. This plan should be a high-level overview of the research direction.
+    2. **Generate Search Queries:** Based on your research plan, generate a list of specific search queries to be executed in a web browser. These queries should be designed to efficiently gather relevant information for each aspect of your plan.

-1.  **User Instruction:** The original instruction given by the user.
-2.  **Previous Queries:** History Queries.
-3.  **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results this string will be empty.
-"""
-search_messages = [SystemMessage(content=search_system_prompt)]
-# 记录和总结的历史信息，保存到raw_infos
-record_system_prompt = """
-You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. Your output will be a concise textual summary of new information.
+    **Output Format:**
+
+    Your output will be a JSON object with the following structure:
+
+    ```json
+    {
+    "plan": "A concise, high-level research plan outlining the key areas to investigate.",
+      "queries": [
+        "search query 1",
+        "search query 2",
+        //... up to a maximum of 3 search queries
+      ]
+    }
+    ```
+
+    **Important:**
+
+    *   Limit your output to a **maximum of 3** search queries.
+    *   Make the search queries to help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results.
+    *   If you have gathered for all the information you want and no further search queries are required, output queries with an empty list: `[]`
+    *   Make sure output search queries are different from the history queries.
+
+    **Inputs:**
+
+    1.  **User Instruction:** The original instruction given by the user.
+    2.  **Previous Queries:** History Queries.
+    3.  **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results this string will be empty.
+    """
+    search_messages = [SystemMessage(content=search_system_prompt)]
+
+    # History of recorded and summarized information, saved to raw_infos
+    record_system_prompt = """
+    You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. Your output will be a JSON formatted list, where each element represents a piece of extracted information and follows the structure: `{"url": "source_url", "title": "source_title", "summary_content": "concise_summary", "thinking": "reasoning"}`.

 **Important Considerations:**

-1. Minimize Information Loss: While concise, prioritize retaining important details and nuances from the sources. Aim for a summary that captures the essence of the information without over-simplification.
+1. **Minimize Information Loss:** While concise, prioritize retaining important details and nuances from the sources. Aim for a summary that captures the essence of the information without over-simplification.

-2. Avoid Redundancy: Do not record information that is already present in the Previous Recorded Information. Check for semantic similarity, not just exact matches. However, if the same information is expressed differently in a new source and this variation adds valuable context or clarity, it should be included.
+2. **Avoid Redundancy:** Do not record information that is already present in the Previous Recorded Information. Check for semantic similarity, not just exact matches. However, if the same information is expressed differently in a new source and this variation adds valuable context or clarity, it should be included.

-3. Utility Focus: Only record information that is likely to be useful for completing the user's original instruction. Ask yourself: "How might this information contribute to the AI agent achieving its goal?" Prefer more information over less, as long as it remains relevant to the user's request.
+3. **Utility Focus:** Only record information that is likely to be useful for completing the user's original instruction. Ask yourself: "How might this information contribute to the AI agent achieving its goal?" Prefer more information over less, as long as it remains relevant to the user's request.

-4. Include Source Information: When summarizing information extracted from a specific source (like a webpage or article), always include the source title and URL if available. This helps in verifying the information and providing context.
+4. **Source Information:** Extract and include the source title and URL for each piece of information summarized. This is crucial for verification and context. If a piece of information cannot be attributed to a specific source from the provided search results, use `"url": "unknown"` and `"title": "unknown"`.
+
+5. **Thinking and Report Structure:**  For each extracted piece of information, add a `"thinking"` key. This field should contain your assessment of how this information could be used in a report, which section it might belong to (e.g., introduction, background, analysis, conclusion, specific subtopics), and any other relevant thoughts about its significance or connection to other information.
+
+**Output Format:**
+
+Provide your output as a JSON formatted list. Each item in the list must adhere to the following format:
+
+```json
+[
+  {
+    "url": "source_url_1",
+    "title": "source_title_1",
+    "summary_content": "concise_summary_of_content_from_source_1",
+    "thinking": "This could be used in the introduction to set the context. It also relates to the section on the history of the topic."
+  },
+  // ... more entries
+  {
+    "url": "unknown",
+    "title": "unknown",
+    "summary_content": "concise_summary_of_content_without_clear_source",
+    "thinking": "This might be useful background information, but I need to verify its accuracy. Could be used in the methodology section to explain how data was collected."
+  }
+]
+```

-Format: Provide your output as a textual summary. When source information is available, you must use the format: **[title](url): summarized content**. If no specific source is identified, just provide the summary. No JSON or other structured output is needed beyond this format.
 **Inputs:**

-1.  **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful.
-2.  **Current Search Results:** Textual data gathered from the most recent search query.
-3.  **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string. This string might be empty if no information has been recorded yet.
-"""
-record_messages = [SystemMessage(content=record_system_prompt)]
+1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking.
+2. **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string.
+3. **Current Search Results:** Textual data gathered from the most recent search query.
+    """
+    record_messages = [SystemMessage(content=record_system_prompt)]

-browser = Browser(
-    config=BrowserConfig(
-        disable_security=True,
-        headless=False, # Set to False to see browser actions
+    browser = Browser(
+        config=BrowserConfig(
+            disable_security=True,
+            headless=False,  # Set to False to see browser actions
+        )
    )
-)
-controller = CustomController()
-
-
-async def deep_research():
-    global search_infos
-    global search_messages
-    global record_messages
-    global browser
-    global task
-    global llm
-    global save_dir
+    controller = CustomController()

    search_iteration = 0
-    max_search_iterations = 4 # Limit search iterations to prevent infinite loop
-    use_vision = True
+    max_search_iterations = 4  # Limit search iterations to prevent infinite loop
+    use_vision = False

    history_query = []
+    history_infos = []
    try:
        while search_iteration < max_search_iterations:
            search_iteration += 1
-            print(f"开始第 {search_iteration} 轮搜索...")
-            previous_queries = ""
+            print(f"Start {search_iteration}th Search...")
+            history_queries = ""
            for i in range(len(history_query)):
-                previous_queries += f"{i+1}. {history_query[i]}\n" 
-            query_prompt = f"User Instruction:{task} \n Previous Queries: {previous_queries} \n Previous Search Results:\n {search_infos}"
+                history_queries += f"{i + 1}. {history_query[i]}\n"
+            history_infos_ = json.dumps(history_infos, indent=4)
+            query_prompt = f"User Instruction:{task} \n Previous Queries: {history_queries} \n Previous Search Results:\n {history_infos_}"
            search_messages.append(HumanMessage(content=query_prompt))
            ai_query_msg = llm.invoke(search_messages[:1] + search_messages[1:][-1:])
            if hasattr(ai_query_msg, "reasoning_content"):
                print("🤯 Start Search Deep Thinking: ")
                print(ai_query_msg.reasoning_content)
                print("🤯 End Search Deep Thinking")
-            ai_content = ai_query_msg.content.replace("```json", "").replace("```", "")
-            ai_content = repair_json(ai_content)
-            query_tasks = json.loads(ai_content)
+            ai_query_content = ai_query_msg.content.replace("```json", "").replace("```", "")
+            ai_query_content = repair_json(ai_query_content)
+            ai_query_content = json.loads(ai_query_content)
+            query_plan = ai_query_content["plan"]
+            print("Current Planing:")
+            print(query_plan)
+            query_tasks = ai_query_content["queries"]
            if not query_tasks:
                break
            else:
                history_query.extend(query_tasks)
+                print("Query tasks:")
+                print(query_tasks)
                search_messages.append(ai_query_msg)
-            print(f"搜索关键词/问题: {query_tasks}")

            # 2. Perform Web Search and Auto exec
-            agents = [CustomAgent(task=task + ". Please click on the most relevant link to get information and go deeper, instead of just staying on the search page.", 
-                                  llm=llm_bu, 
-                                  browser=browser, 
-                                  use_vision=use_vision,
-                                  system_prompt_class=CustomSystemPrompt,
-                                  agent_prompt_class=CustomAgentMessagePrompt,
-                                  max_actions_per_step=5,
-                                  controller=controller
-                                  ) for task in query_tasks]
+            agents = [CustomAgent(
+                task=task + ". Please click on the most relevant link to get information and go deeper, instead of just staying on the search page.",
+                llm=llm_bu,
+                browser=browser,
+                use_vision=use_vision,
+                system_prompt_class=CustomSystemPrompt,
+                agent_prompt_class=CustomAgentMessagePrompt,
+                max_actions_per_step=5,
+                controller=controller
+            ) for task in query_tasks]
            query_results = await asyncio.gather(*[agent.run(max_steps=5) for agent in agents])
-            
-            # 3. Summarize Search Result
-            cur_search_rets = ""
-            for i in range(len(query_tasks)):
-                cur_search_rets += f"{i+1}. {query_tasks[i]}\n {query_results[i].final_result()}\n"
-            record_prompt = f"User Instruction:{task}. \n Current Search Results: {cur_search_rets}\n Previous Search Results:\n {search_infos}"
-            record_messages.append(HumanMessage(content=record_prompt))
-            ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:])
-            if hasattr(ai_record_msg, "reasoning_content"):
-                print("🤯 Start Record Deep Thinking: ")
-                print(ai_record_msg.reasoning_content)
-                print("🤯 End Record Deep Thinking")
-            record_content = ai_record_msg.content
-            search_infos += record_content + "\n"
-            record_messages.append(ai_record_msg)
-            print(search_infos)

-        print("\n搜索完成, 开始生成报告...")
+            # 3. Summarize Search Result
+            query_result_dir = os.path.join(save_dir, "query_results")
+            os.makedirs(query_result_dir, exist_ok=True)
+            for i in range(len(query_tasks)):
+                query_result = query_results[i].final_result()
+                with open(os.path.join(query_result_dir, f"{search_iteration}-{i}.md"), "w") as fw:
+                    fw.write(f"Query: {query_tasks[i]}\n")
+                    fw.write(query_result)
+                history_infos_ = json.dumps(history_infos, indent=4)
+                record_prompt = f"User Instruction:{task}. \nPrevious Recorded Information:\n {json.dumps(history_infos_)} \n Current Search Results: {query_result}\n "
+                record_messages.append(HumanMessage(content=record_prompt))
+                ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:])
+                if hasattr(ai_record_msg, "reasoning_content"):
+                    print("🤯 Start Record Deep Thinking: ")
+                    print(ai_record_msg.reasoning_content)
+                    print("🤯 End Record Deep Thinking")
+                record_content = ai_record_msg.content
+                record_content = repair_json(record_content)
+                new_record_infos = json.loads(record_content)
+                history_infos.extend(new_record_infos)
+                record_messages.append(ai_record_msg)
+
+        print("\nFinish Searching, Start Generating Report...")

        # 5. Report Generation in Markdown (or JSON if you prefer)
        writer_system_prompt = """
-        create polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided.  Please write the report using Markdown format, ensuring it is both informative and visually appealing.
+        You are a professional report writer tasked with creating polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided. You will write the report using Markdown format, ensuring it is both informative and visually appealing.
+
+**Specific Instructions:**

-Specific Instructions:
 *   **Structure for Impact:** The report must have a clear, logical, and impactful structure. Begin with a compelling introduction that immediately grabs the reader's attention. Develop well-structured body paragraphs that flow smoothly and logically, and conclude with a concise and memorable conclusion that summarizes key takeaways and leaves a lasting impression.
-*   **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read.  Use stylistic techniques to enhance engagement. Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability.
-*   **Accuracy, Credibility, and Citations:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. **Cite sources exclusively using bracketed sequential numbers within the text (e.g., [1], [2], etc.). If no references are used, omit citations entirely.** These numbers must correspond to a numbered list of references at the end of the report. 
+*   **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read. Use stylistic techniques to enhance engagement. Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability.
+*   **Accuracy, Credibility, and Citations:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. **Cite sources exclusively using bracketed sequential numbers within the text (e.g., [1], [2], etc.). If no references are used, omit citations entirely.** These numbers must correspond to a numbered list of references at the end of the report.
 *   **Publication-Ready Formatting:** Adhere strictly to Markdown formatting for excellent readability and a clean, highly professional visual appearance. Pay close attention to formatting details like headings, lists, emphasis, and spacing to optimize the visual presentation and reader experience. The report should be ready for immediate publication upon completion, requiring minimal to no further editing for style or format.
 *   **Conciseness and Clarity (Unless Specified Otherwise):** When the user does not provide a specific length, prioritize concise and to-the-point writing, maximizing information density while maintaining clarity.
 *   **Length Adherence:** When the user specifies a length constraint, meticulously stay within reasonable bounds of that specification, ensuring the content is appropriately scaled without sacrificing quality or completeness.
-*   **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions.  Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism.
-*   **Output Final Report Only Instruction:** This new instruction is explicitly added at the end to directly address the user's requirement.  It clearly commands the LLM to output *only* the final article and to avoid any other elements. The bolded emphasis further reinforces this crucial requirement.
-*   **Reference List Formatting: ** The reference list at the end must be formatted as follows: [1] Title (URL, if available) [2] Title2 (URL2, if available) etc.
-**Output Final Report Only.**
+*   **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions. Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism.
+*   **Reference List Formatting:** The reference list at the end must be formatted as follows: `[1] Title (URL, if available)`.
+*   **ABSOLUTE FINAL OUTPUT RESTRICTION:**  **Your output must contain ONLY the finished, publication-ready Markdown report. Do not include ANY extraneous text, phrases, preambles, meta-commentary, or markdown code indicators (e.g., "```markdown```"). The report should begin directly with the title and introductory paragraph, and end directly after the conclusion and the reference list (if applicable).**  **Your response will be deemed a failure if this instruction is not followed precisely.**
+        
+**Inputs:**
+
+1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking.
+3. **Search Information:** Information gathered from the recent search queries.
        """
-        report_prompt = f"User Instruction:{task} \n Search Information:\n {search_infos}"
-        report_messages = [SystemMessage(content=writer_system_prompt), HumanMessage(content=report_prompt)] # New context for report generation
+        with open(os.path.join(save_dir, "record_infos.json"), "w") as fw:
+            json.dump(history_infos, fw)
+        history_infos_ = json.dumps(history_infos, indent=4)
+        report_prompt = f"User Instruction:{task} \n Search Information:\n {history_infos_}"
+        report_messages = [SystemMessage(content=writer_system_prompt),
+                           HumanMessage(content=report_prompt)]  # New context for report generation
        ai_report_msg = llm.invoke(report_messages)
        if hasattr(ai_report_msg, "reasoning_content"):
            print("🤯 Start Report Deep Thinking: ")
@@ -193,18 +237,14 @@ Specific Instructions:
        report_content = ai_report_msg.content

        if report_content:
-            report_file_path = os.path.join(save_dir, "result.md")
+            report_file_path = os.path.join(save_dir, "final_report.md")
            with open(report_file_path, "w", encoding="utf-8") as f:
                f.write(report_content)
            print(f"报告已生成并保存到: {report_file_path}")

-            print("\nFinal Result: (Report Content)")
-            pprint(report_content, indent=4) # Print the final report content
-
        else:
            print("未能生成报告内容。")

-
    except Exception as e:
        print(f"Deep research 过程中发生错误: {e}")
    finally:
@@ -212,5 +252,6 @@ Specific Instructions:
            await browser.close()
            print("Browser closed.")

+
 if __name__ == "__main__":
    asyncio.run(deep_research())