"""
Pre-submission validation script for the OpenEnv Code Debug environment.

Checks all items on the hackathon pre-submission checklist.
"""
import json
import os
import sys
import subprocess
from pathlib import Path

import httpx

HF_SPACE_URL = "https://arnavk-openenv-code-debugger.hf.space"
ROOT = Path(__file__).parent

# ANSI-coloured status tags used in the console report.
PASS = "\033[92m[PASS]\033[0m"
FAIL = "\033[91m[FAIL]\033[0m"
WARN = "\033[93m[WARN]\033[0m"

# One entry per executed check, in execution order; read by the summary.
results = []


def check(name: str, ok: bool, detail: str = "") -> None:
    """Print a single PASS/FAIL report line and record the outcome.

    Args:
        name: Human-readable label of the check.
        ok: Outcome of the check; truthy prints the PASS tag, falsy the
            FAIL tag.
        detail: Optional extra text appended after an em dash.

    The outcome is appended to the module-level ``results`` list.
    """
    status = PASS if ok else FAIL
    print(f"{status} {name}" + (f" — {detail}" if detail else ""))
    results.append(ok)


# ------------------------------------------------------------------
# 1. Local file structure
# ------------------------------------------------------------------
print("\n=== File Structure ===")
check("inference.py at root", (ROOT / "inference.py").exists())
check("Dockerfile at root", (ROOT / "Dockerfile").exists())
check("openenv.yaml exists", (ROOT / "code_debug_env" / "openenv.yaml").exists())
check("models.py exists", (ROOT / "code_debug_env" / "models.py").exists())
check("server/app.py exists", (ROOT / "code_debug_env" / "server" / "app.py").exists())
check("server/environment.py exists", (ROOT / "code_debug_env" / "server" / "environment.py").exists())
check("server/executor.py exists", (ROOT / "code_debug_env" / "server" / "executor.py").exists())

tasks_dir = ROOT / "code_debug_env" / "tasks"
task_files = list(tasks_dir.rglob("*.json"))
check("3+ task files", len(task_files) >= 3, f"{len(task_files)} found")

difficulties = set()
bad_task_files = []
for tf in task_files:
    # A single broken task file must not abort the whole validation run:
    # collect the problem and keep scanning the remaining files.
    try:
        t = json.loads(tf.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        bad_task_files.append(f"{tf.name}: {e}")
        continue
    if not isinstance(t, dict):
        bad_task_files.append(f"{tf.name}: top-level JSON value is not an object")
        continue
    difficulties.add(t.get("difficulty"))

# Only reported when something is wrong, so a healthy run keeps the same
# set of checks as before.
if bad_task_files:
    check("Task files are valid JSON objects", False, "; ".join(bad_task_files))
check("All 3 difficulty levels present", {"easy", "medium", "hard"}.issubset(difficulties), str(difficulties))
# ------------------------------------------------------------------
# 2. inference.py content checks
# ------------------------------------------------------------------
print("\n=== inference.py Content ===")
try:
    inf = (ROOT / "inference.py").read_text(encoding="utf-8")
except (OSError, UnicodeDecodeError) as e:
    # An unreadable or missing inference.py must not stop the remaining
    # checks from running; with empty content every substring check below
    # is simply reported as failed.
    print(f"{WARN} could not read inference.py — {e}")
    inf = ""
check("Uses OpenAI client", "from openai import OpenAI" in inf)
check("Reads API_BASE_URL from env", "API_BASE_URL" in inf)
check("Reads MODEL_NAME from env", "MODEL_NAME" in inf)
check("Reads HF_TOKEN from env", "HF_TOKEN" in inf)
check("[START] log line", "[START]" in inf)
check("[STEP] log line", "[STEP]" in inf)
check("[END] log line", "[END]" in inf)

# ------------------------------------------------------------------
# 3. HF Space liveness
# ------------------------------------------------------------------
print("\n=== HF Space Liveness ===")
try:
    r = httpx.get(f"{HF_SPACE_URL}/health", timeout=30)
    check("HF Space returns HTTP 200", r.status_code == 200, f"status={r.status_code}")
    data = r.json()
    check("Health response is healthy", data.get("status") == "healthy", str(data))
except Exception as e:
    # Any transport or decoding failure is reported as one failed check.
    check("HF Space reachable", False, str(e))

# ------------------------------------------------------------------
# 4. reset() responds
# ------------------------------------------------------------------
print("\n=== reset() / step() Endpoints ===")
try:
    r = httpx.post(f"{HF_SPACE_URL}/reset", json={}, timeout=30)
    check("POST /reset returns 200", r.status_code == 200)
    # Decode the body once and reuse it for every field check.
    reset_payload = r.json()
    ep = reset_payload.get("episode_id")
    check("reset() returns episode_id", bool(ep))
    # ``or {}`` also covers an explicit null observation, so the three
    # field checks below fail individually instead of raising.
    obs = reset_payload.get("observation") or {}
    check("observation has buggy_code", bool(obs.get("buggy_code")))
    check("observation has test_descriptions", bool(obs.get("test_descriptions")))
    check("observation has difficulty", bool(obs.get("difficulty")))
except Exception as e:
    check("reset() works", False, str(e))
    # Without a usable episode the step()/state() checks are skipped.
    ep = None
# ------------------------------------------------------------------
# 5. step() returns reward in 0.0–1.0
# ------------------------------------------------------------------
if ep:
    try:
        r2 = httpx.post(f"{HF_SPACE_URL}/step/{ep}", json={"action": {"code": "def placeholder(): pass"}}, timeout=30)
        check("POST /step returns 200", r2.status_code == 200)
        d = r2.json()
        reward = d.get("reward", -1)
        # Type-check before comparing: a null or string reward would raise
        # TypeError in the range test and be misreported as a step() failure.
        reward_ok = isinstance(reward, (int, float)) and 0.0 <= reward <= 1.0
        check("reward in [0.0, 1.0]", reward_ok, f"reward={reward}")
        check("done field is bool", isinstance(d.get("done"), bool))
    except Exception as e:
        check("step() works", False, str(e))

# ------------------------------------------------------------------
# 6. state() endpoint
# ------------------------------------------------------------------
if ep:
    try:
        r3 = httpx.get(f"{HF_SPACE_URL}/state/{ep}", timeout=30)
        check("GET /state returns 200", r3.status_code == 200)
        s = r3.json()
        check("state has episode_id", bool(s.get("episode_id")))
        # Presence test only: a step_count of 0 is still a valid value.
        check("state has step_count", "step_count" in s)
    except Exception as e:
        check("state() works", False, str(e))

# ------------------------------------------------------------------
# 7. Tasks enumeration
# ------------------------------------------------------------------
print("\n=== Task Enumeration ===")
try:
    r4 = httpx.get(f"{HF_SPACE_URL}/tasks", timeout=30)
    check("GET /tasks returns 200", r4.status_code == 200)
    tasks = r4.json()
    check("3+ tasks listed", len(tasks) >= 3, f"{len(tasks)} tasks")
    # NOTE(review): assumes the endpoint returns a list of objects that each
    # carry a "difficulty" key — confirm against the server implementation.
    task_difficulties = {t["difficulty"] for t in tasks}
    check("All difficulties present in tasks endpoint", {"easy","medium","hard"}.issubset(task_difficulties))
except Exception as e:
    check("tasks endpoint works", False, str(e))
# ------------------------------------------------------------------
# 8. inference.py log format check (dry run on one task)
# ------------------------------------------------------------------
print("\n=== inference.py Log Format ===")

# Child environment: the current environment plus the settings below,
# which take precedence over any values already present.
env = {
    **os.environ,
    "API_BASE_URL": "https://router.huggingface.co/v1",
    "MODEL_NAME": "Qwen/Qwen2.5-72B-Instruct",
    "HF_TOKEN": os.getenv("HF_TOKEN", ""),
    "ENV_URL": HF_SPACE_URL,
}

try:
    proc = subprocess.run(
        [sys.executable, str(ROOT / "inference.py")],
        capture_output=True,
        text=True,
        timeout=300,
        env=env,
    )
    # Split stdout once, then look for a line starting with each tag.
    stdout_lines = proc.stdout.splitlines()
    tag_found = {
        tag: any(line.startswith(tag) for line in stdout_lines)
        for tag in ("[START]", "[STEP]", "[END]")
    }
    for tag, found in tag_found.items():
        check(f"{tag} line emitted", found)

    exit_code = proc.returncode
    check("inference.py exits cleanly", exit_code == 0, f"exit={exit_code}")
    # Show the head of stderr only when the run failed and wrote something.
    if exit_code and proc.stderr:
        print(f" stderr: {proc.stderr[:300]}")
except subprocess.TimeoutExpired:
    check("inference.py completes within 5 min", False, "timed out")
except Exception as e:
    check("inference.py runs", False, str(e))

# ------------------------------------------------------------------
# Summary
# ------------------------------------------------------------------
print("\n=== Summary ===")
total = len(results)
passed = sum(results)
# One PASS/FAIL token per recorded check, in execution order.
print("".join("PASS" if outcome else "FAIL" for outcome in results))
print(f"{passed}/{total} checks passed")
if passed != total:
    print(f"\n[ACTION NEEDED] Fix {total - passed} failing check(s) before submitting.")
else:
    print("\n[READY] All checks passed - ready to submit!")