From 6777ea255a5145cafb967ba369caf938528a2034 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Mon, 15 Apr 2024 18:47:54 +0800 Subject: [PATCH] Fix https://github.com/xlang-ai/OSWorld/issues/21 ; Update README for multimodal agents; Add badge in README; Add setup.py --- README.md | 20 ++++++++++ mm_agents/README.md | 5 ++- setup.py | 92 +++++++++++++++++++++++++++++++++++++++++++++ setup_vm.py | 2 +- 4 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 setup.py diff --git a/README.md b/README.md index 8fede83..efc57fa 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,26 @@ Discord

+

+ + + + + + + + + + + + + + + +
+

+ + ## 📢 Updates - 2024-04-11: We released our [paper](https://arxiv.org/abs/2404.07972), [environment and benchmark](https://github.com/xlang-ai/OSWorld), and [project page](https://os-world.github.io/). Check it out! diff --git a/mm_agents/README.md b/mm_agents/README.md index ccf95d4..be4c476 100644 --- a/mm_agents/README.md +++ b/mm_agents/README.md @@ -31,7 +31,7 @@ agent = PromptAgent( agent.reset() # say we have an instruction and observation instruction = "Please help me to find the nearest restaurant." -obs = {"screenshot": "path/to/observation.jpg"} +obs = {"screenshot": open("path/to/observation.jpg", 'rb').read()} response, actions = agent.predict( instruction, obs @@ -51,8 +51,9 @@ And the following action spaces: To feed an observation into the agent, you have to maintain the `obs` variable as a dict with the corresponding information: ```python +# continue from the previous code snippet obs = { - "screenshot": "path/to/observation.jpg", + "screenshot": open("path/to/observation.jpg", 'rb').read(), "a11y_tree": "" # [a11y_tree data] } response, actions = agent.predict( diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..c4a5c06 --- /dev/null +++ b/setup.py @@ -0,0 +1,92 @@ +import subprocess +import sys + +from setuptools import setup, find_packages +from setuptools.command.install import install + + +class InstallPlaywrightCommand(install): + """Customized setuptools install command that runs 'playwright install'.""" + + def run(self): + # Call the original install command to handle regular installation process + install.run(self) + + # Attempt to run 'playwright install' using subprocess + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + print("Successfully ran 'playwright install'.") + except subprocess.CalledProcessError as e: + print("Failed to run 'playwright install'. Please run 'playwright install' manually.") + print(e) + + +setup( + name="desktop_env", + version="0.1.5", + author="Tianbao Xie, Danyang Zhang, Jixuan Chen, Xiaochuan Li, Siheng Zhao, Ruisheng Cao, Toh Jing Hua, etc.", + author_email="tianbaoxiexxx@gmail.com", + description="The package provides a desktop environment for setting and evaluating desktop automation tasks.", + long_description=open('README.md', encoding="utf-8").read(), + long_description_content_type="text/markdown", + url="https://github.com/xlang-ai/desktop_env", + packages=find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], + python_requires='>=3.9', + install_requires=[ + "numpy~=1.24.3", + "Pillow~=10.1.0", + "fabric", + "gymnasium~=0.28.1", + "requests~=2.31.0", + "transformers~=4.35.2", + "torch~=2.1.1", + "accelerate", + "opencv-python~=4.8.1.78", + "matplotlib~=3.7.4", + "pynput~=1.7.6", + "pyautogui~=0.9.54", + "psutil~=5.9.6", + "tqdm~=4.65.0", + "pandas~=2.0.3", + "flask~=3.0.0", + "requests-toolbelt~=1.0.0", + "lxml", + "cssselect", + "xmltodict", + "openpyxl", + "python-docx", + "python-pptx", + "pypdf", + "PyGetWindow", + "rapidfuzz", + "pyacoustid", + "opencv-python", + "ImageHash", + "scikit-image", + "librosa", + "pymupdf", + "chardet", + "playwright", + "formulas", + "pydrive", + "fastdtw", + "odfpy", + "func-timeout", + "beautifulsoup4", + "PyYaml", + "mutagen", + "easyocr", + "borb", + "pypdf2", + "pdfplumber", + "wrapt_timeout_decorator" + ], + cmdclass={ + 'install': InstallPlaywrightCommand, # Use the custom install command + }, +) \ No newline at end of file diff --git a/setup_vm.py b/setup_vm.py index fafdd15..9f6e000 100644 --- a/setup_vm.py +++ b/setup_vm.py @@ -17,7 +17,7 @@ def download_and_unzip_vm(): # Determine the platform and CPU architecture to decide the correct VM image to download if platform.machine() == 'arm64': # macOS with Apple Silicon url = "https://huggingface.co/datasets/xlangai/ubuntu_arm/resolve/main/Ubuntu.zip" - elif platform.machine().lower() == 'amd64': + elif platform.machine().lower() in ['amd64', "x86_64"]: url = "https://huggingface.co/datasets/xlangai/ubuntu_x86/resolve/main/Ubuntu.zip" else: raise Exception("Unsupported platform or architecture")