Spaces:
Sleeping
Sleeping
| """ | |
| Pre-submission validation script for the OpenEnv Code Debug environment. | |
| Checks all items on the hackathon pre-submission checklist. | |
| """ | |
| import json | |
| import os | |
| import sys | |
| import subprocess | |
| from pathlib import Path | |
| import httpx | |
# Base URL of the deployed Hugging Face Space targeted by the remote checks.
HF_SPACE_URL = "https://arnavk-openenv-code-debugger.hf.space"
# Directory containing this script; local file checks are resolved against it.
ROOT = Path(__file__).parent

# Status tags wrapped in ANSI colour escape sequences.
PASS = "\033[92m[PASS]\033[0m"
FAIL = "\033[91m[FAIL]\033[0m"
WARN = "\033[93m[WARN]\033[0m"

# One boolean per executed check, in execution order.
results = []


def check(name, ok, detail=""):
    """Print a PASS/FAIL line for one check and record its outcome.

    Args:
        name: Human-readable label of the check.
        ok: Outcome of the check; interpreted by truthiness.
        detail: Optional extra text appended after an em dash.
    """
    # Normalise to a real bool so that summing ``results`` counts passed
    # checks instead of adding up arbitrary truthy values (e.g. a length).
    passed = bool(ok)
    status = PASS if passed else FAIL
    print(f"{status} {name}" + (f" — {detail}" if detail else ""))
    results.append(passed)
# ------------------------------------------------------------------
# 1. Local file structure
# ------------------------------------------------------------------
print("\n=== File Structure ===")

# (check label, path components relative to ROOT) for every required file.
required_files = (
    ("inference.py at root", ("inference.py",)),
    ("Dockerfile at root", ("Dockerfile",)),
    ("openenv.yaml exists", ("code_debug_env", "openenv.yaml")),
    ("models.py exists", ("code_debug_env", "models.py")),
    ("server/app.py exists", ("code_debug_env", "server", "app.py")),
    ("server/environment.py exists", ("code_debug_env", "server", "environment.py")),
    ("server/executor.py exists", ("code_debug_env", "server", "executor.py")),
)
for label, parts in required_files:
    check(label, ROOT.joinpath(*parts).exists())

tasks_dir = ROOT / "code_debug_env" / "tasks"
task_files = list(tasks_dir.rglob("*.json"))
check("3+ task files", len(task_files) >= 3, f"{len(task_files)} found")

# Collect the "difficulty" value of every task file that parses to an object.
difficulties = set()
for tf in task_files:
    try:
        t = json.loads(tf.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # An unreadable or malformed task file is reported as a failed check
        # instead of aborting the whole validation run with a traceback.
        # (JSON and Unicode decode errors are both ValueError subclasses.)
        check(f"Task file {tf.name} is readable JSON", False, str(e))
        continue
    if not isinstance(t, dict):
        # Only JSON objects can carry a "difficulty" key.
        check(f"Task file {tf.name} is a JSON object", False, f"got {type(t).__name__}")
        continue
    difficulties.add(t.get("difficulty"))
check("All 3 difficulty levels present", {"easy", "medium", "hard"}.issubset(difficulties), str(difficulties))
# ------------------------------------------------------------------
# 2. inference.py content checks
# ------------------------------------------------------------------
print("\n=== inference.py Content ===")
try:
    inf = (ROOT / "inference.py").read_text(encoding="utf-8")
except (OSError, UnicodeDecodeError) as e:
    # A missing or unreadable inference.py must not crash the script: fall
    # back to empty content so every check below is reported as a failure.
    print(f"could not read inference.py: {e}")
    inf = ""

# (check label, substring that must occur somewhere in inference.py).
content_checks = (
    ("Uses OpenAI client", "from openai import OpenAI"),
    ("Reads API_BASE_URL from env", "API_BASE_URL"),
    ("Reads MODEL_NAME from env", "MODEL_NAME"),
    ("Reads HF_TOKEN from env", "HF_TOKEN"),
    ("[START] log line", "[START]"),
    ("[STEP] log line", "[STEP]"),
    ("[END] log line", "[END]"),
)
for label, needle in content_checks:
    check(label, needle in inf)
# ------------------------------------------------------------------
# 3. HF Space liveness
# ------------------------------------------------------------------
print("\n=== HF Space Liveness ===")
try:
    # Query the Space's /health endpoint and inspect both status and body.
    health_response = httpx.get(f"{HF_SPACE_URL}/health", timeout=30)
    status_code = health_response.status_code
    check("HF Space returns HTTP 200", status_code == 200, f"status={status_code}")
    data = health_response.json()
    is_healthy = data.get("status") == "healthy"
    check("Health response is healthy", is_healthy, str(data))
except Exception as exc:
    # Any failure above (request, JSON decoding, unexpected payload shape)
    # is reported as a single failed check.
    check("HF Space reachable", False, str(exc))
# ------------------------------------------------------------------
# 4. reset() responds
# ------------------------------------------------------------------
print("\n=== reset() / step() Endpoints ===")
try:
    r = httpx.post(f"{HF_SPACE_URL}/reset", json={}, timeout=30)
    check("POST /reset returns 200", r.status_code == 200)
    # Decode the body once and reuse it for every field lookup.
    reset_payload = r.json()
    ep = reset_payload.get("episode_id")
    check("reset() returns episode_id", bool(ep))
    # ``or {}`` also covers an explicit ``"observation": null``, so the three
    # checks below fail individually instead of raising AttributeError.
    obs = reset_payload.get("observation") or {}
    check("observation has buggy_code", bool(obs.get("buggy_code")))
    check("observation has test_descriptions", bool(obs.get("test_descriptions")))
    check("observation has difficulty", bool(obs.get("difficulty")))
except Exception as e:
    check("reset() works", False, str(e))
    # Without a usable reset the step()/state() sections are skipped.
    ep = None
# ------------------------------------------------------------------
# 5. step() returns reward in 0.0–1.0
# ------------------------------------------------------------------
if ep:
    try:
        r2 = httpx.post(
            f"{HF_SPACE_URL}/step/{ep}",
            json={"action": {"code": "def placeholder(): pass"}},
            timeout=30,
        )
        check("POST /step returns 200", r2.status_code == 200)
        d = r2.json()
        # A missing reward defaults to -1, which is outside the valid range.
        reward = d.get("reward", -1)
        # Guard the range comparison: a non-numeric reward (e.g. JSON null)
        # would otherwise raise TypeError, be misreported as "step() works"
        # failing, and skip the ``done`` check below.
        reward_ok = isinstance(reward, (int, float)) and 0.0 <= reward <= 1.0
        check("reward in [0.0, 1.0]", reward_ok, f"reward={reward}")
        check("done field is bool", isinstance(d.get("done"), bool))
    except Exception as e:
        check("step() works", False, str(e))
# ------------------------------------------------------------------
# 6. state() endpoint
# ------------------------------------------------------------------
if ep:
    # Only runs when the reset section produced an episode id.
    try:
        state_response = httpx.get(f"{HF_SPACE_URL}/state/{ep}", timeout=30)
        check("GET /state returns 200", state_response.status_code == 200)
        state = state_response.json()
        has_episode_id = bool(state.get("episode_id"))
        check("state has episode_id", has_episode_id)
        # step_count only needs to be present; its value is not inspected.
        check("state has step_count", "step_count" in state)
    except Exception as exc:
        check("state() works", False, str(exc))
# ------------------------------------------------------------------
# 7. Tasks enumeration
# ------------------------------------------------------------------
print("\n=== Task Enumeration ===")
try:
    r4 = httpx.get(f"{HF_SPACE_URL}/tasks", timeout=30)
    check("GET /tasks returns 200", r4.status_code == 200)
    tasks = r4.json()
    check("3+ tasks listed", len(tasks) >= 3, f"{len(tasks)} tasks")
    # ``.get`` mirrors the local task-file scan: an entry without a
    # "difficulty" key makes the coverage check fail on its own merits
    # rather than surfacing as a KeyError under "tasks endpoint works".
    task_difficulties = {t.get("difficulty") for t in tasks}
    check("All difficulties present in tasks endpoint", {"easy", "medium", "hard"}.issubset(task_difficulties))
except Exception as e:
    check("tasks endpoint works", False, str(e))
# ------------------------------------------------------------------
# 8. inference.py log format check (dry run on one task)
# ------------------------------------------------------------------
print("\n=== inference.py Log Format ===")
# Child environment: the current environment with the endpoint/model settings
# overridden; HF_TOKEN is taken from the current environment ("" when unset).
env = {
    **os.environ,
    "API_BASE_URL": "https://router.huggingface.co/v1",
    "MODEL_NAME": "Qwen/Qwen2.5-72B-Instruct",
    "HF_TOKEN": os.getenv("HF_TOKEN", ""),
    "ENV_URL": HF_SPACE_URL,
}
try:
    proc = subprocess.run(
        [sys.executable, str(ROOT / "inference.py")],
        env=env,
        timeout=300,
        text=True,
        capture_output=True,
    )
    stdout_lines = proc.stdout.splitlines()
    # Each tag must appear at the very start of at least one stdout line.
    for tag in ("[START]", "[STEP]", "[END]"):
        tag_seen = any(line.startswith(tag) for line in stdout_lines)
        check(f"{tag} line emitted", tag_seen)
    exit_code = proc.returncode
    check("inference.py exits cleanly", exit_code == 0, f"exit={exit_code}")
    if exit_code != 0 and proc.stderr:
        # Show the first 300 characters of stderr to help diagnose the failure.
        print(f" stderr: {proc.stderr[:300]}")
except subprocess.TimeoutExpired:
    check("inference.py completes within 5 min", False, "timed out")
except Exception as exc:
    check("inference.py runs", False, str(exc))
# ------------------------------------------------------------------
# Summary
# ------------------------------------------------------------------
print("\n=== Summary ===")
# Count by truthiness so the tally is right even for non-bool entries.
passed = sum(1 for r in results if r)
total = len(results)
# One PASS/FAIL token per check, in execution order, separated by spaces so
# the sequence stays readable.
print(" ".join("PASS" if r else "FAIL" for r in results))
print(f"{passed}/{total} checks passed")
if passed == total:
    print("\n[READY] All checks passed - ready to submit!")
else:
    print(f"\n[ACTION NEEDED] Fix {total - passed} failing check(s) before submitting.")
    # Signal failure to the caller (shell, CI, pre-commit hook) through the
    # process exit status instead of always exiting with 0.
    sys.exit(1)