mirror of
https://github.com/YerbaPage/LongCodeZip.git
synced 2025-10-22 23:19:46 +03:00
51 lines
1.6 KiB
Python
51 lines
1.6 KiB
Python
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import gzip
|
|
import json
|
|
import os
|
|
|
|
import tempdir
|
|
import wget
|
|
from appdirs import user_cache_dir
|
|
|
|
# Per-user cache directory for the "repoqa" application, as resolved by appdirs.
CACHE_DIR = user_cache_dir("repoqa")

# Optional path to a local dataset file; None when the variable is not set.
REPOQA_DATA_OVERRIDE_PATH = os.environ.get("REPOQA_DATA_OVERRIDE_PATH")
# Dataset release tag, used to build the download URL and the cache file name.
REPOQA_DATA_VERSION = os.environ.get("REPOQA_DATA_VERSION", "2024-06-23")
|
|
|
|
|
|
def _get_repoqa_data_ready_path() -> str:
    """Return the path of a local RepoQA dataset JSON file, downloading it if needed.

    If ``REPOQA_DATA_OVERRIDE_PATH`` is set, that path is returned unchanged
    after checking that it exists. Otherwise the file for
    ``REPOQA_DATA_VERSION`` is looked up in ``CACHE_DIR``; on a cache miss the
    gzip archive is downloaded, decompressed and stored there.

    Returns:
        Path to the dataset JSON file.

    Raises:
        AssertionError: If the override path is set but does not exist.
    """
    if REPOQA_DATA_OVERRIDE_PATH:
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is kept for callers.
        if not os.path.exists(REPOQA_DATA_OVERRIDE_PATH):
            raise AssertionError(f"File not found: {REPOQA_DATA_OVERRIDE_PATH}")
        return REPOQA_DATA_OVERRIDE_PATH

    gzip_url = f"https://github.com/evalplus/repoqa_release/releases/download/{REPOQA_DATA_VERSION}/repoqa-{REPOQA_DATA_VERSION}.json.gz"
    cache_path = os.path.join(CACHE_DIR, f"repoqa-{REPOQA_DATA_VERSION}.json")

    # Cache hit: the dataset for this version is already in CACHE_DIR.
    if os.path.exists(cache_path):
        return cache_path

    # Cache miss: download the gzip archive and decompress it in memory.
    print(f"Downloading dataset from {gzip_url}")
    with tempdir.TempDir() as tmpdir:
        gzip_path = os.path.join(tmpdir, "data.json.gz")
        wget.download(gzip_url, gzip_path)

        # Must be read before the temporary directory is cleaned up.
        with gzip.open(gzip_path, "rb") as f:
            repoqa_data = f.read().decode("utf-8")

    # Create CACHE_DIR if it does not exist yet.
    os.makedirs(CACHE_DIR, exist_ok=True)

    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated file that later calls would accept as a
    # valid cache entry.
    partial_path = f"{cache_path}.{os.getpid()}.part"
    try:
        # NOTE(review): written with the platform's default text encoding,
        # unchanged from before -- confirm whether UTF-8 should be forced here
        # and wherever the file is read back.
        with open(partial_path, "w") as f:
            f.write(repoqa_data)
        os.replace(partial_path, cache_path)
    finally:
        # Only still present if the write or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return cache_path
|
|
|
|
|
|
def get_repoqa_data():
    """Load the RepoQA dataset and return the parsed JSON content.

    The file location comes from ``_get_repoqa_data_ready_path()``.
    """
    data_path = _get_repoqa_data_ready_path()
    with open(data_path, "r") as handle:
        dataset = json.load(handle)
    return dataset
|