import json
import re

from utils import *
from tools import *
from inference import *


def extract_json_between_markers(llm_output):
    # Regular expression pattern to find JSON content between ```json and ```
    json_pattern = r"```json(.*?)```"
    matches = re.findall(json_pattern, llm_output, re.DOTALL)

    if not matches:
        # Fallback: try to find any JSON-like content in the output
        json_pattern = r"\{.*?\}"
        matches = re.findall(json_pattern, llm_output, re.DOTALL)

    for json_string in matches:
        json_string = json_string.strip()
        try:
            parsed_json = json.loads(json_string)
            return parsed_json
        except json.JSONDecodeError:
            # Attempt to fix common JSON issues
            try:
                # Remove invalid control characters
                json_string_clean = re.sub(r"[\x00-\x1F\x7F]", "", json_string)
                parsed_json = json.loads(json_string_clean)
                return parsed_json
            except json.JSONDecodeError:
                continue  # Try the next match

    return None  # No valid JSON found
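# Illustrative usage (a sketch; the string below is a hypothetical model output):
#
#   llm_output = 'THOUGHT: ...\nREVIEW JSON:\n```json\n{"Overall": 7}\n```'
#   extract_json_between_markers(llm_output)  # -> {"Overall": 7}
#
# If no fenced ```json block is found, the fallback regex returns the first
# brace-delimited substring that parses as JSON; otherwise the result is None.
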
def get_score(outlined_plan, latex, reward_model_llm, reviewer_type=None, attempts=3, openai_api_key=None):
    e = str()
    for _attempt in range(attempts):
        try:
            # todo: have a reward function here
            # template inherited from the AI Scientist (good work on this prompt Sakana AI team :D)
            template_instructions = """
            Respond in the following format:

            THOUGHT:
            <THOUGHT>

            REVIEW JSON:
            ```json
            <JSON>
            ```

            In <THOUGHT>, first briefly discuss your intuitions and reasoning for the evaluation.
            Detail your high-level arguments, necessary choices and desired outcomes of the review.
            Do not make generic comments here, but be specific to your current paper.
            Treat this as the note-taking phase of your review.

            In <JSON>, provide the review in JSON format with the following fields in the order:
            - "Summary": A summary of the paper content and its contributions.
            - "Strengths": A list of strengths of the paper.
            - "Weaknesses": A list of weaknesses of the paper.
            - "Originality": A rating from 1 to 4 (low, medium, high, very high).
            - "Quality": A rating from 1 to 4 (low, medium, high, very high).
            - "Clarity": A rating from 1 to 4 (low, medium, high, very high).
            - "Significance": A rating from 1 to 4 (low, medium, high, very high).
            - "Questions": A set of clarifying questions to be answered by the paper authors.
            - "Limitations": A set of limitations and potential negative societal impacts of the work.
            - "Ethical Concerns": A boolean value indicating whether there are ethical concerns.
            - "Soundness": A rating from 1 to 4 (poor, fair, good, excellent).
            - "Presentation": A rating from 1 to 4 (poor, fair, good, excellent).
            - "Contribution": A rating from 1 to 4 (poor, fair, good, excellent).
            - "Overall": A rating from 1 to 10 (very strong reject to award quality).
            - "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute).
            - "Decision": A decision that has to be one of the following: Accept, Reject.

            For the "Decision" field, don't use Weak Accept, Borderline Accept, Borderline Reject, or Strong Reject. Instead, only use Accept or Reject.
            This JSON will be automatically parsed, so ensure the format is precise.
            """
            neurips_form = ("""
            ## Review Form
            Below is a description of the questions you will be asked on the review form for each paper and some guidelines on what to consider when answering these questions.
            When writing your review, please keep in mind that after decisions have been made, reviews and meta-reviews of accepted papers and opted-in rejected papers will be made public.

            1. Summary: Briefly summarize the paper and its contributions. This is not the place to critique the paper; the authors should generally agree with a well-written summary.

            2. Strengths and Weaknesses: Please provide a thorough assessment of the strengths and weaknesses of the paper, touching on each of the following dimensions:
            - Originality: Are the tasks or methods new? Is the work a novel combination of well-known techniques? (This can be valuable!) Is it clear how this work differs from previous contributions? Is related work adequately cited?
            - Quality: Is the submission technically sound? Are claims well supported (e.g., by theoretical analysis or experimental results)? Are the methods used appropriate? Is this a complete piece of work or work in progress? Are the authors careful and honest about evaluating both the strengths and weaknesses of their work?
            - Clarity: Is the submission clearly written? Is it well organized? (If not, please make constructive suggestions for improving its clarity.) Does it adequately inform the reader? (Note that a superbly written paper provides enough information for an expert reader to reproduce its results.)
            - Significance: Are the results important? Are others (researchers or practitioners) likely to use the ideas or build on them? Does the submission address a difficult task in a better way than previous work? Does it advance the state of the art in a demonstrable way? Does it provide unique data, unique conclusions about existing data, or a unique theoretical or experimental approach?

            3. Questions: Please list up and carefully describe any questions and suggestions for the authors. Think of the things where a response from the author can change your opinion, clarify a confusion or address a limitation. This can be very important for a productive rebuttal and discussion phase with the authors.

            4. Limitations: Have the authors adequately addressed the limitations and potential negative societal impact of their work? If not, please include constructive suggestions for improvement. In general, authors should be rewarded rather than punished for being up front about the limitations of their work and any potential negative societal impact. You are encouraged to think through whether any critical points are missing and provide these as feedback for the authors.

            5. Ethical concerns: If there are ethical issues with this paper, please flag the paper for an ethics review. For guidance on when this is appropriate, please review the NeurIPS ethics guidelines.

            6. Soundness: Please assign the paper a numerical rating on the following scale to indicate the soundness of the technical claims, experimental and research methodology and on whether the central claims of the paper are adequately supported with evidence.
            4: excellent
            3: good
            2: fair
            1: poor

            7. Presentation: Please assign the paper a numerical rating on the following scale to indicate the quality of the presentation. This should take into account the writing style and clarity, as well as contextualization relative to prior work.
            4: excellent
            3: good
            2: fair
            1: poor

            8. Contribution: Please assign the paper a numerical rating on the following scale to indicate the quality of the overall contribution this paper makes to the research area being studied. Are the questions being asked important? Does the paper bring a significant originality of ideas and/or execution? Are the results valuable to share with the broader NeurIPS community?
            4: excellent
            3: good
            2: fair
            1: poor

            9. Overall: Please provide an "overall score" for this submission.
            Choices:
            10: Award quality: Technically flawless paper with groundbreaking impact on one or more areas of AI, with exceptionally strong evaluation, reproducibility, and resources, and no unaddressed ethical considerations.
            9: Very Strong Accept: Technically flawless paper with groundbreaking impact on at least one area of AI and excellent impact on multiple areas of AI, with flawless evaluation, resources, and reproducibility, and no unaddressed ethical considerations.
            8: Strong Accept: Technically strong paper, with novel ideas, excellent impact on at least one area of AI or high-to-excellent impact on multiple areas of AI, with excellent evaluation, resources, and reproducibility, and no unaddressed ethical considerations.
            7: Accept: Technically solid paper, with high impact on at least one sub-area of AI or moderate-to-high impact on more than one area of AI, with good-to-excellent evaluation, resources, reproducibility, and no unaddressed ethical considerations.
            6: Weak Accept: Technically solid, moderate-to-high impact paper, with no major concerns with respect to evaluation, resources, reproducibility, ethical considerations.
            5: Borderline accept: Technically solid paper where reasons to accept outweigh reasons to reject, e.g., limited evaluation. Please use sparingly.
            4: Borderline reject: Technically solid paper where reasons to reject, e.g., limited evaluation, outweigh reasons to accept, e.g., good evaluation. Please use sparingly.
            3: Reject: For instance, a paper with technical flaws, weak evaluation, inadequate reproducibility and incompletely addressed ethical considerations.
            2: Strong Reject: For instance, a paper with major technical flaws, and/or poor evaluation, limited impact, poor reproducibility and mostly unaddressed ethical considerations.
            1: Very Strong Reject: For instance, a paper with trivial results or unaddressed ethical considerations.

            10. Confidence: Please provide a "confidence score" for your assessment of this submission to indicate how confident you are in your evaluation.
            Choices:
            5: You are absolutely certain about your assessment. You are very familiar with the related work and checked the math/other details carefully.
            4: You are confident in your assessment, but not absolutely certain. It is unlikely, but not impossible, that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work.
            3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
            2: You are willing to defend your assessment, but it is quite likely that you did not understand the central parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
            1: Your assessment is an educated guess. The submission is not in your area or the submission was difficult to understand. Math/other details were not carefully checked.

            You must make sure that all sections are properly created: abstract, introduction, methods, results, and discussion. Points must be reduced from your scores if any of these are missing.
            """ + template_instructions)

            if reviewer_type is None:
                reviewer_type = ""
            sys = (
                "You are an AI researcher who is reviewing a paper that was submitted to a prestigious ML venue. "
                f"Be critical and cautious in your decision. {reviewer_type}\n"
            ) + neurips_form
            scoring = query_model(
                model_str=reward_model_llm,
                system_prompt=sys,
                openai_api_key=openai_api_key,
                prompt=(
                    f"Outlined in the following text is the research plan that the machine learning engineer was tasked with building: {outlined_plan}\n\n"
                    f"The following text is the research latex that the model produced: \n{latex}\n\n"),
                temp=0.0)
            review_json = extract_json_between_markers(scoring)

            # Normalize every rubric item to [0, 1] by its maximum possible rating
            overall = int(review_json["Overall"]) / 10
            soundness = int(review_json["Soundness"]) / 4
            confidence = int(review_json["Confidence"]) / 5
            contribution = int(review_json["Contribution"]) / 4
            presentation = int(review_json["Presentation"]) / 4
            clarity = int(review_json["Clarity"]) / 4
            originality = int(review_json["Originality"]) / 4
            quality = int(review_json["Quality"]) / 4
            significance = int(review_json["Significance"]) / 4

            clarity_weight = 0.1
            quality_weight = 0.1
            overall_weight = 1.0
            soundness_weight = 0.1
            confidence_weight = 0.1
            originality_weight = 0.1
            significance_weight = 0.1
            contribution_weight = 0.4
            presentation_weight = 0.2

            # Maximum possible weighted sum, used to rescale the final score to [0, 10]
            max_score = (
                clarity_weight + quality_weight + overall_weight + soundness_weight
                + confidence_weight + originality_weight + significance_weight
                + contribution_weight + presentation_weight)

            performance = ((
                soundness_weight * soundness
                + presentation_weight * presentation
                + confidence_weight * confidence
                + contribution_weight * contribution
                + overall_weight * overall
                + originality_weight * originality
                + significance_weight * significance
                + clarity_weight * clarity
                + quality_weight * quality) / max_score) * 10

            return performance, f"The performance of your submission is: {performance}" + scoring, True
        except Exception as ex:
            e = str(ex)
            print(e)
            continue  # retry up to `attempts` times before giving up
    return 0, e, False
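# Illustrative call (a sketch with hypothetical inputs; assumes query_model is
# configured for the chosen backend):
#
#   performance, feedback, success = get_score(
#       outlined_plan="Fine-tune a small transformer on SST-2 and report accuracy.",
#       latex=r"\documentclass{article} ... \end{document}",
#       reward_model_llm="gpt-4o-mini",
#       openai_api_key="sk-...",
#   )
#
# `performance` is the weighted rubric average rescaled to [0, 10]; the Overall
# rating dominates (weight 1.0), followed by Contribution (0.4) and
# Presentation (0.2), with the remaining items at 0.1 each.
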
class ReviewersAgent:
    def __init__(self, model="gpt-4o-mini", notes=None, openai_api_key=None):
        if notes is None:
            self.notes = []
        else:
            self.notes = notes
        self.model = model
        self.openai_api_key = openai_api_key

    def inference(self, plan, report):
        reviewer_1 = "You are a harsh but fair reviewer and expect good experiments that lead to insights for the research topic."
        review_1 = get_score(outlined_plan=plan, latex=report, reward_model_llm=self.model, reviewer_type=reviewer_1, openai_api_key=self.openai_api_key)

        reviewer_2 = "You are a harsh and critical but fair reviewer who is looking for ideas that would be impactful in the field."
        review_2 = get_score(outlined_plan=plan, latex=report, reward_model_llm=self.model, reviewer_type=reviewer_2, openai_api_key=self.openai_api_key)

        reviewer_3 = "You are a harsh but fair open-minded reviewer that is looking for novel ideas that have not been proposed before."
        review_3 = get_score(outlined_plan=plan, latex=report, reward_model_llm=self.model, reviewer_type=reviewer_3, openai_api_key=self.openai_api_key)

        return f"Reviewer #1:\n{review_1}, \nReviewer #2:\n{review_2}, \nReviewer #3:\n{review_3}"
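# Illustrative ensemble usage (a sketch; the plan and report strings are hypothetical):
#
#   reviewers = ReviewersAgent(model="gpt-4o-mini", openai_api_key=my_key)
#   feedback = reviewers.inference(plan="Train a small CNN on MNIST ...",
#                                  report=r"\documentclass{article} ...")
#
# Each persona string is appended to the reviewer system prompt, so the same
# paper is scored three times under different reviewer biases.
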
class BaseAgent:
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        if notes is None:
            self.notes = []
        else:
            self.notes = notes
        self.max_steps = max_steps
        self.model = model
        self.phases = []
        self.plan = str()
        self.report = str()
        self.history = list()
        self.prev_comm = str()
        self.prev_report = str()
        self.exp_results = str()
        self.dataset_code = str()
        self.results_code = str()
        self.lit_review_sum = str()
        self.interpretation = str()
        self.prev_exp_results = str()
        self.reviewer_response = str()
        self.prev_results_code = str()
        self.prev_interpretation = str()
        self.openai_api_key = openai_api_key
        self.second_round = False
        self.max_hist_len = 15

    def set_model_backbone(self, model):
        self.model = model

    @staticmethod
    def clean_text(text):
        """
        Fix minor corrections
        :return: (str) corrected text
        """
        text = text.replace("```\n", "```")
        return text

    def inference(self, research_topic, phase, step, feedback="", temp=None):
        sys_prompt = f"""You are {self.role_description()} \nTask instructions: {self.phase_prompt(phase)}\n{self.command_descriptions(phase)}"""  # \n{self.example_command(phase)}
        context = self.context(phase)
        history_str = "\n".join([_[1] for _ in self.history])
        phase_notes = [_note for _note in self.notes if phase in _note["phases"]]
        notes_str = f"Notes for the task objective: {phase_notes}\n" if len(phase_notes) > 0 else ""
        complete_str = str()
        if step / (self.max_steps - 1) > 0.7:
            complete_str = "You must finish this task and submit as soon as possible!"
        prompt = (
            f"""{context}\n{'~' * 10}\nHistory: {history_str}\n{'~' * 10}\n"""
            f"Current Step #{step}, Phase: {phase}\n{complete_str}\n"
            f"[Objective] Your goal is to perform research on the following topic: {research_topic}\n"
            f"Feedback: {feedback}\nNotes: {notes_str}\nYour previous command was: {self.prev_comm}. Make sure your new output is very different.\nPlease produce a single command below:\n")
        model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, temp=temp, openai_api_key=self.openai_api_key)
        print("^" * 50, phase, "^" * 50)
        model_resp = self.clean_text(model_resp)
        self.prev_comm = model_resp

        # Feedback may carry an expiration header (```EXPIRATION N ... ```)
        # giving the number of steps the entry should remain in the history
        steps_exp = None
        if feedback is not None and "```EXPIRATION" in feedback:
            steps_exp = int(feedback.split("\n")[0].replace("```EXPIRATION ", ""))
            feedback = extract_prompt(feedback, "EXPIRATION")
        self.history.append((steps_exp, f"Step #{step}, Phase: {phase}, Feedback: {feedback}, Your response: {model_resp}"))

        # Decrement expiration counters and drop history entries that have expired
        for _i in reversed(range(len(self.history))):
            if self.history[_i][0] is not None:
                self.history[_i] = (self.history[_i][0] - 1, self.history[_i][1])
                if self.history[_i][0] < 0:
                    self.history.pop(_i)
        if len(self.history) >= self.max_hist_len:
            self.history.pop(0)
        return model_resp

    def reset(self):
        self.history.clear()  # clear the stored (expiration, entry) history
        self.prev_comm = ""

    def context(self, phase):
        raise NotImplementedError("Subclasses should implement this method.")

    def phase_prompt(self, phase):
        raise NotImplementedError("Subclasses should implement this method.")

    def role_description(self):
        raise NotImplementedError("Subclasses should implement this method.")

    def command_descriptions(self, phase):
        raise NotImplementedError("Subclasses should implement this method.")

    def example_command(self, phase):
        raise NotImplementedError("Subclasses should implement this method.")
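# Minimal subclass sketch (illustrative only; ExampleAgent and its single
# "example phase" are hypothetical and not part of the pipeline):
#
#   class ExampleAgent(BaseAgent):
#       def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
#           super().__init__(model, notes, max_steps, openai_api_key)
#           self.phases = ["example phase"]
#
#       def context(self, phase):
#           return ""
#
#       def phase_prompt(self, phase):
#           return "Your goal is to complete the example phase."
#
#       def command_descriptions(self, phase):
#           return "You can produce dialogue using: ```DIALOGUE\ndialogue here\n```\n"
#
#       def example_command(self, phase):
#           return ()
#
#       def role_description(self):
#           return "an example researcher."
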
class ProfessorAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = ["report writing"]

    def generate_readme(self):
        sys_prompt = f"""You are {self.role_description()} \nHere is the written paper \n{self.report}. Task instructions: Your goal is to integrate all of the knowledge, code, reports, and notes provided to you and generate a readme.md for a github repository."""
        history_str = "\n".join([_[1] for _ in self.history])
        prompt = (
            f"""History: {history_str}\n{'~' * 10}\n"""
            f"Please produce the readme below in markdown:\n")
        model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, openai_api_key=self.openai_api_key)
        return model_resp.replace("```markdown", "")

    def context(self, phase):
        # sr_str = str()
        # if self.second_round:
        #     sr_str = (
        #         f"The following are results from the previous experiments\n"
        #         f"Previous Experiment code: {self.prev_results_code}\n"
        #         f"Previous Results: {self.prev_exp_results}\n"
        #         f"Previous Interpretation of results: {self.prev_interpretation}\n"
        #         f"Previous Report: {self.prev_report}\n"
        #         f"{self.reviewer_response}\n\n\n"
        #     )
        # if phase == "report writing":
        #     return (
        #         sr_str +
        #         f"Current Literature Review: {self.lit_review_sum}\n"
        #         f"Current Plan: {self.plan}\n"
        #         f"Current Dataset code: {self.dataset_code}\n"
        #         f"Current Experiment code: {self.results_code}\n"
        #         f"Current Results: {self.exp_results}\n"
        #         f"Current Interpretation of results: {self.interpretation}\n"
        #     )
        return ""

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return (
            "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where dialogue here is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
            "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\n\n``` where COMMAND is the specific command you want to run (e.g. REPORT, DIALOGUE).\n")

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return (
            "When you believe a good report has been arrived at between you and the PhD student you can use the following command to end the dialogue and submit the report ```LATEX\nreport here\n```\n where report here is the actual report written in compilable latex to be transmitted and LATEX is just the word LATEX.\n"
            "Your report should include numbers, relevant metrics to the experiment (e.g. accuracy or loss) and measures of significance. You must propagate this information accurately. You must also submit the report promptly. Do not delay too long.\n"
            "You must be incredibly detailed about what you did for the experiment and all of the findings.\n"
        )

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        phase_str = (
            "You are directing a PhD student to help them write a report in latex based on results from an experiment, and you interact with them through dialogue.\n"
            "Your goal is to write a report in latex for an experiment. You should read through the code, read through the interpretation, and look at the results to understand what occurred. You should then discuss with the PhD student how they can write up the results and give their feedback to improve their thoughts.\n"
        )
        return phase_str

    def role_description(self):
        return "a computer science professor at a top university."
class PostdocAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = ["plan formulation", "results interpretation"]

    def context(self, phase):
        sr_str = str()
        if self.second_round:
            sr_str = (
                f"The following are results from the previous experiments\n"
                f"Previous Experiment code: {self.prev_results_code}\n"
                f"Previous Results: {self.prev_exp_results}\n"
                f"Previous Interpretation of results: {self.prev_interpretation}\n"
                f"Previous Report: {self.prev_report}\n"
                f"{self.reviewer_response}\n\n\n"
            )
        if phase == "plan formulation":
            return sr_str + f"Current Literature Review: {self.lit_review_sum}"
        elif phase == "results interpretation":
            return (
                sr_str
                + f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}\n"
                f"Current Dataset code: {self.dataset_code}\n"
                f"Current Experiment code: {self.results_code}\n"
                f"Current Results: {self.exp_results}"
            )
        return ""

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return ()

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "plan formulation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where dialogue here is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "When you believe a good plan has been arrived at between you and the PhD student you can use the following command to end the dialogue and submit the plan ```PLAN\nplan here\n```\n where plan here is the actual plan to be transmitted and PLAN is just the word PLAN. Plan here should provide a clear outline for how to achieve the task, including what machine learning models to use and implement, what types of datasets should be searched for and used to train the model, and the exact details of the experiment.\n"
                "You can only use a SINGLE command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, NOT BOTH.\n"
                "Make sure not to produce too much dialogue and to submit a plan in a reasonable amount of time.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. PLAN, DIALOGUE).\n"
            )
        elif phase == "results interpretation":
            return (
                "When you believe a good interpretation has been arrived at between you and the PhD student you can use the following command to end the dialogue and submit the interpretation ```INTERPRETATION\ninterpretation here\n```\n where interpretation here is the actual interpretation to be transmitted and INTERPRETATION is just the word INTERPRETATION. Please provide an INTERPRETATION in a reasonable amount of time.\n"
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where dialogue here is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "You must submit the interpretation during this phase in a reasonable amount of time. Do not delay the submission.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. INTERPRETATION, DIALOGUE).\n"
            )

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "plan formulation":
            phase_str = (
                "You are directing a PhD student to help them come up with a good plan, and you interact with them through dialogue.\n"
                "Your goal is to produce plans that would make good experiments for the given topic. You should aim for a very simple experiment that showcases your plan, not a complex one. You should integrate the provided literature review and come up with plans on how to expand and build on these works for the given topic. Your plans should provide a clear outline for how to achieve the task, including what machine learning models to use and implement, what types of datasets should be searched for and used to train the model, and the exact details of the experiment.\n"
            )
        elif phase == "results interpretation":
            phase_str = (
                "You are directing a PhD student to help them come up with an interpretation for results from an experiment, and you interact with them through dialogue.\n"
                "Your goal is to interpret results from experiments that were previously run. You should read through the code and look at the results to understand what occurred. You should then discuss with the PhD student how they can interpret the results and give their feedback to improve their thoughts. You should integrate the provided literature review, code, and plans to come up with an exciting interpretation that could make a compelling paper. Your plans should provide a clear outline that can be used to write an academic paper.\n"
                "Your interpretation should include numbers, relevant metrics to the experiment (e.g. accuracy or loss) and measures of significance. You must propagate this information accurately. You must also complete this in a reasonable amount of time and then submit your results.\n"
            )
        return phase_str

    def role_description(self):
        return "a computer science postdoctoral researcher at a top university."


class MLEngineerAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = [
            "data preparation",
            "running experiments",
        ]

    def context(self, phase):
        sr_str = str()
        if self.second_round:
            sr_str = (
                f"The following are results from the previous experiments\n"
                f"Previous Experiment code: {self.prev_results_code}\n"
                f"Previous Results: {self.prev_exp_results}\n"
                f"Previous Interpretation of results: {self.prev_interpretation}\n"
                f"Previous Report: {self.prev_report}\n"
                f"{self.reviewer_response}\n\n\n"
            )
        if phase == "data preparation":
            return (
                sr_str
                + f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}")
        # elif phase == "running experiments":
        #     return (
        #         sr_str +
        #         f"Current Literature Review: {self.lit_review_sum}\n"
        #         f"Current Plan: {self.plan}\n"
        #         f"Current Dataset code: {self.dataset_code}\n"
        #     )
        return ""

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return ()

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "data preparation":
            return (
                "You can produce code using the following command: ```python\ncode here\n```\n where code here is the actual code you will execute in a Python terminal, and python is just the word python. Try to incorporate some print functions. Do not use any classes or functions. If your code returns any errors, they will be provided to you, and you are also able to see print statements. You will receive all print statement results from the code. Make sure function variables are created inside the function or passed as a function parameter.\n"  # Try to avoid creating functions.
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where dialogue here is the actual dialogue you will send, and DIALOGUE is just the word DIALOGUE.\n"
                "You also have access to HuggingFace datasets. You can search the datasets repository using the following command: ```SEARCH_HF\nsearch query here\n``` where search query here is the query used to search HuggingFace datasets, and SEARCH_HF is the word SEARCH_HF. This will return a list of HuggingFace dataset descriptions which can be loaded into Python using the datasets library. Your code MUST use an external HuggingFace dataset.\n"
                "You MUST use a HuggingFace dataset in your code. DO NOT CREATE A MAIN FUNCTION. Try to make the code very simple.\n"
                "You can only use a SINGLE command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, NOT BOTH.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. python, DIALOGUE, SEARCH_HF).\n")
        return ()

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "data preparation":
            phase_str = (
                "You are a machine learning engineer being directed by a PhD student who will help you write the code, and you can interact with them through dialogue.\n"
                "Your goal is to produce code that prepares the data for the provided experiment. You should aim for simple code to prepare the data, not complex code. You should integrate the provided literature review and the plan and come up with code to prepare data for this experiment.\n"
            )
        return phase_str

    def role_description(self):
        return "a machine learning engineer working at a top university."
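# Illustrative command round-trip (a sketch; the reply text is hypothetical,
# and extract_prompt is the utils helper used above for the EXPIRATION header,
# assumed here to return the body between the ```COMMAND and closing ``` fence):
#
#   model_resp = "```DIALOGUE\nLet's start with a small CIFAR-10 baseline.\n```"
#   dialogue = extract_prompt(model_resp, "DIALOGUE")
#   # expected -> "Let's start with a small CIFAR-10 baseline."
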
class PhDStudentAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = [
            "literature review",
            "plan formulation",
            "data preparation",
            "running experiments",
            "results interpretation",
            "report writing",
            "report refinement",
        ]
        self.lit_review = []

    def context(self, phase):
        sr_str = str()
        if self.second_round:
            sr_str = (
                f"The following are results from the previous experiments\n"
                f"Previous Experiment code: {self.prev_results_code}\n"
                f"Previous Results: {self.prev_exp_results}\n"
                f"Previous Interpretation of results: {self.prev_interpretation}\n"
                f"Previous Report: {self.prev_report}\n"
                f"{self.reviewer_response}\n\n\n"
            )
        if phase == "plan formulation":
            return sr_str + f"Current Literature Review: {self.lit_review_sum}"
        elif phase == "data preparation":
            return (
                sr_str
                + f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}"
            )
        # elif phase == "running experiments":
        #     return (
        #         sr_str +
        #         f"Current Literature Review: {self.lit_review_sum}\n"
        #         f"Current Plan: {self.plan}\n"
        #         f"Current Dataset code: {self.dataset_code}\n"
        #     )
        elif phase == "results interpretation":
            return (
                sr_str
                + f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}\n"
                f"Current Dataset code: {self.dataset_code}\n"
                f"Current Experiment code: {self.results_code}\n"
                f"Current Results: {self.exp_results}"
            )
        # elif phase == "report writing":
        #     return (
        #         sr_str +
        #         f"Current Literature Review: {self.lit_review_sum}\n"
        #         f"Current Plan: {self.plan}\n"
        #         f"Current Dataset code: {self.dataset_code}\n"
        #         f"Current Experiment code: {self.results_code}\n"
        #         f"Current Results: {self.exp_results}\n"
        #         f"Current Interpretation of results: {self.interpretation}"
        #     )
        elif phase == "report refinement":
            return (
                sr_str
                + f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}\n"
                f"Current Dataset code: {self.dataset_code}\n"
                f"Current Experiment code: {self.results_code}\n"
                f"Current Results: {self.exp_results}\n"
                f"Current Interpretation of results: {self.interpretation}"
            )
        elif phase == "literature review":
            return sr_str
        else:
            return ""

    def requirements_txt(self):
        sys_prompt = f"""You are {self.role_description()} \nTask instructions: Your goal is to integrate all of the knowledge, code, reports, and notes provided to you and generate a requirements.txt for a github repository for all of the code."""
        history_str = "\n".join([_[1] for _ in self.history])
        prompt = (
            f"""History: {history_str}\n{'~' * 10}\n"""
            f"Please produce the requirements.txt below in markdown:\n")
        model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, openai_api_key=self.openai_api_key)
        return model_resp

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return ()

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "literature review":
            return (
                "To collect paper summaries, use the following command: ```SUMMARY\nSEARCH QUERY\n```\n where SEARCH QUERY is a string that will be used to find papers with semantically similar content and SUMMARY is just the word SUMMARY. Make sure your search queries are very short.\n"
                "To get the full paper text for an arXiv paper, use the following command: ```FULL_TEXT\narXiv paper ID\n```\n where arXiv paper ID is the ID of the arXiv paper (which can be found by using the SUMMARY command), and FULL_TEXT is just the word FULL_TEXT. Make sure to read the full text using the FULL_TEXT command before adding it to your list of relevant papers.\n"
                "If you believe a paper is relevant to the research project proposal, you can add it to the official review after reading using the following command: ```ADD_PAPER\narXiv_paper_ID\nPAPER_SUMMARY\n```\nwhere arXiv_paper_ID is the ID of the arXiv paper, PAPER_SUMMARY is a brief summary of the paper, and ADD_PAPER is just the word ADD_PAPER. You can only add one paper at a time.\n"
                "Make sure to use ADD_PAPER when you see a relevant paper. DO NOT use SUMMARY too many times.\n"
                "You can only use a single command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, not both.\n"
                "Make sure to extensively discuss the experimental results in your summary.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. ADD_PAPER, FULL_TEXT, SUMMARY). Do not use the word COMMAND; make sure to use the actual command, e.g. your command should look exactly like this: ```ADD_PAPER\ntext\n``` (where the command could be from ADD_PAPER, FULL_TEXT, SUMMARY)\n")
        elif phase == "plan formulation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "You can only use a single command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, not both.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. DIALOGUE).\n"
            )
        elif phase == "data preparation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "When you and the ML engineer have finalized your dataset preparation code and are ready to submit the final code, please use the following command: ```SUBMIT_CODE\ncode here\n```\n where 'code here' is the finalized code you will send and SUBMIT_CODE is just the word SUBMIT_CODE. Do not use any classes or functions. The submitted code must have a HuggingFace dataset import and must use an external HuggingFace dataset. If your code returns any errors, they will be provided to you, and you are also able to see print statements. Make sure function variables are created inside the function or passed as a function parameter. DO NOT CREATE A MAIN FUNCTION.\n"
                "Make sure to submit code in a reasonable amount of time. Do not make the code too complex; try to make it simple. Do not take too long to submit code. Submit the code early. You should submit the code ASAP.\n"
                "You can only use a single command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, not both.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. SUBMIT_CODE, DIALOGUE).\n")
        elif phase == "results interpretation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. DIALOGUE).\n"
            )
        # elif phase == "report writing":
        #     return (
        #         "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
        #         "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. DIALOGUE).\n")
        elif phase == "report refinement":
            return ""
        return ""

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "literature review":
            phase_str = (
                "Your goal is to perform a literature review for the presented task and add papers to the literature review.\n"
                "You have access to arXiv and can perform two search operations: (1) finding many different paper summaries from a search query and (2) getting a single full paper text for an arXiv paper.\n"
            )
            rev_papers = "Papers in your review so far: " + " ".join([_paper["arxiv_id"] for _paper in self.lit_review])
            phase_str += rev_papers if len(self.lit_review) > 0 else ""
        elif phase == "plan formulation":
            phase_str = (
                "You are a PhD student being directed by a postdoc who will help you come up with a good plan, and you interact with them through dialogue.\n"
                "Your goal is to produce plans that would make good experiments for the given topic. You should aim for a very simple experiment that showcases your plan, not a complex one. You should integrate the provided literature review and come up with plans on how to expand and build on these works for the given topic. Your plans should provide a clear outline for how to achieve the task, including what machine learning models to use and implement, what types of datasets should be searched for and used to train the model, and the exact details of the experiment.\n"
            )
        elif phase == "data preparation":
            phase_str = (
                "You are a PhD student directing a machine learning engineer, where the machine learning engineer will be writing the code, and you can interact with them through dialogue.\n"
                "Your goal is to help the ML engineer produce code that prepares the data for the provided experiment. You should aim for very simple code to prepare the data, not complex code. You should integrate the provided literature review and the plan and come up with code to prepare data for this experiment.\n"
            )
        elif phase == "results interpretation":
            phase_str = (
                "You are a PhD student being directed by a postdoc who will help you come up with an interpretation for results from an experiment, and you interact with them through dialogue.\n"
                "Your goal is to interpret results from experiments that were previously run. You should read through the code and look at the results to understand what occurred. You should then discuss with the postdoc your interpretation and use their feedback to improve your thoughts. You should integrate the provided literature review, code, and plans to come up with an exciting interpretation that could make a compelling paper. Your plans should provide a clear outline that can be used to write an academic paper.\n"
                "Your interpretation should include numbers, relevant metrics to the experiment (e.g. accuracy or loss) and measures of significance. You must propagate this information accurately.\n"
                "You must submit the interpretation during this phase in a reasonable amount of time. Do not delay the submission.\n"
            )
        # elif phase == "report writing":
        #     phase_str = (
        #         "You are a PhD student being directed by a professor who will help you write a report based on results from an experiment, and you interact with them through dialogue.\n"
        #         "Your goal is to write a report for an experiment entirely in latex. You should read through the code, read through the interpretation, and look at the results to understand what occurred. You should then discuss with the professor how you can write up the results and receive their feedback to improve your thoughts.\n"
        #         "Your report should include numbers, relevant metrics to the experiment (e.g. accuracy or loss) and measures of significance in latex. You must propagate this information accurately.\n"
        #         "You must be incredibly detailed about what you did for the experiment and all of the findings.\n"
        #     )
        elif phase == "report refinement":
            phase_str = (
                "You are a PhD student who has submitted their paper to an ML conference called ICLR. Your goal was to write a research paper and get high scores from the reviewers so that it gets accepted to the conference.\n"
            )
        else:
            phase_str = ""
        return phase_str

    def role_description(self):
        return "a computer science PhD student at a top university."

    def add_review(self, review, arx_eng):
        try:
            arxiv_id, review_text = review.strip().split("\n", 1)
            full_text = arx_eng.retrieve_full_paper_text(arxiv_id)
            review_entry = {
                "arxiv_id": arxiv_id,
                "full_text": full_text,
                "summary": review_text,
            }
            self.lit_review.append(review_entry)
            return f"Successfully added paper {arxiv_id}", full_text
        except Exception as e:
            return f"Error trying to add review -- bad formatting, try again: {str(e)}", ""

    def format_review(self):
        return "Provided here is a literature review on this topic:\n" + "\n".join(
            f"arXiv ID: {_l['arxiv_id']}, Summary: {_l['summary']}"
            for _l in self.lit_review)
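# Illustrative ADD_PAPER payload and how add_review consumes it (a sketch; the
# arXiv ID and summary are hypothetical, and arx_eng is expected to expose the
# retrieve_full_paper_text method used above):
#
#   review = "2401.00001\nProposes a curriculum strategy for small LMs and summarizes its key results."
#   status, full_text = phd.add_review(review, arx_eng)
#
# The first line is parsed as the arXiv ID and everything after the first
# newline as the summary; on bad formatting an error string is returned instead.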