Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

Michael Rabinovich Cursor committed 11 days ago

Commit

1fd03de

1 Parent(s): b430299

Retry straggler shards on poll-deadline instead of failing the submission

When the Space-side shard poll window elapses with shards still
non-terminal (typically the tail shard stuck QUEUED waiting for
a10g-large GPU capacity, not a compute failure), cancel and re-dispatch
those stragglers and reset the window, up to SHARD_DEADLINE_RETRY_ROUNDS
(2) rounds, before failing. All-or-nothing is preserved; shard uploads
are idempotent so a re-dispatch is safe. Fixes large (full-81) runs that
intermittently failed with "Space-side poll deadline exceeded (2700s)".

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (1) hide show

submit.py +74 -1

submit.py CHANGED Viewed

@@ -242,6 +242,14 @@ SHARD_MAX_RETRIES = 1
 # vs. the per-shard ceiling because queued shards (past the ~8
 # concurrent slots) wait their turn before their own timeout starts.
 SHARD_POLL_DEADLINE_SECONDS = 45 * 60
 # One HfApi client per process. HF_TOKEN is picked up from the env at
 # construction time and reused for every call.
@@ -1312,6 +1320,31 @@ def _dispatch_shard(
     )
 def _poll_shards_until_done(
     submission_id: str,
     submission_blob_url: str,
@@ -1327,8 +1360,16 @@ def _poll_shards_until_done(
     after their retries (empty list means every shard COMPLETED).
     Transient ``inspect_job`` failures retry up to
     :data:`JOB_POLL_MAX_CONSECUTIVE_ERRORS` before raising.
     """
     deadline = time.monotonic() + SHARD_POLL_DEADLINE_SECONDS
     consecutive_errors = 0
     last_done = -1
     total = len(shards)
@@ -1393,12 +1434,44 @@ def _poll_shards_until_done(
                     )
         if time.monotonic() >= deadline:
             for shard_id, st in shards.items():
                 if st["stage"] not in ("COMPLETED", "FAILED"):
                     st["stage"] = "FAILED"
                     st["message"] = (
                         f"Space-side poll deadline exceeded "
-                        f"({SHARD_POLL_DEADLINE_SECONDS}s)"
                     )
             break
         time.sleep(JOB_POLL_INTERVAL_SECONDS)

 # vs. the per-shard ceiling because queued shards (past the ~8
 # concurrent slots) wait their turn before their own timeout starts.
 SHARD_POLL_DEADLINE_SECONDS = 45 * 60
+# When the poll window elapses with shards still non-terminal — typically the
+# tail shard stuck QUEUED waiting for GPU (a10g-large) capacity rather than a
+# compute failure — re-dispatch those stragglers and reset the window, up to
+# this many rounds, before giving up. A fresh dispatch after the window can land
+# a freed slot; shard uploads are idempotent (each rewrites its own staging
+# prefix), so a re-dispatch is safe. Worst-case total wait is roughly
+# SHARD_POLL_DEADLINE_SECONDS * (1 + SHARD_DEADLINE_RETRY_ROUNDS).
+SHARD_DEADLINE_RETRY_ROUNDS = 2
 # One HfApi client per process. HF_TOKEN is picked up from the env at
 # construction time and reused for every call.
     )
+def _cancel_shard_job(state: dict[str, Any]) -> None:
+    """Best-effort cancel of a shard's in-flight job before re-dispatch.
+    Used on the poll-deadline retry path so a straggler that is still
+    QUEUED/RUNNING releases its slot and does not keep writing its staging
+    prefix once a replacement is dispatched. Best-effort: a failure is
+    logged and ignored, since shard uploads are idempotent (a stale job
+    only ever overwrites its own prefix with an equivalent result).
+    """
+    job_id = state.get("job_id")
+    if not job_id:
+        return
+    try:
+        from huggingface_hub import cancel_job
+        cancel_job(
+            job_id=job_id,
+            namespace=EVAL_JOB_NAMESPACE,
+            token=_jobs_token(),
+        )
+        logger.info("Cancelled straggler shard job %s before retry", job_id)
+    except Exception as e:  # noqa: BLE001 - cancel is best-effort
+        logger.warning("Could not cancel shard job %s: %s", job_id, e)
 def _poll_shards_until_done(
     submission_id: str,
     submission_blob_url: str,
     after their retries (empty list means every shard COMPLETED).
     Transient ``inspect_job`` failures retry up to
     :data:`JOB_POLL_MAX_CONSECUTIVE_ERRORS` before raising.
+    If the :data:`SHARD_POLL_DEADLINE_SECONDS` window elapses with shards
+    still non-terminal (the GPU-capacity-starvation case, where a tail
+    shard sits QUEUED), those stragglers are cancelled and re-dispatched
+    and the window resets, up to :data:`SHARD_DEADLINE_RETRY_ROUNDS`
+    rounds, before the submission is finally failed. All-or-nothing is
+    preserved: the list is non-empty unless every shard COMPLETED.
     """
     deadline = time.monotonic() + SHARD_POLL_DEADLINE_SECONDS
+    deadline_rounds_left = SHARD_DEADLINE_RETRY_ROUNDS
     consecutive_errors = 0
     last_done = -1
     total = len(shards)
                     )
         if time.monotonic() >= deadline:
+            stragglers = [
+                sid for sid, st in shards.items()
+                if st["stage"] not in ("COMPLETED", "FAILED")
+            ]
+            if stragglers and deadline_rounds_left > 0:
+                deadline_rounds_left -= 1
+                logger.warning(
+                    "Poll deadline (%ds) hit for %s with %d straggler shard(s) "
+                    "%s; re-dispatching (%d round(s) left).",
+                    SHARD_POLL_DEADLINE_SECONDS, submission_id,
+                    len(stragglers), stragglers, deadline_rounds_left,
+                )
+                for sid in stragglers:
+                    st = shards[sid]
+                    _cancel_shard_job(st)
+                    # Give the replacement a fresh ERROR-retry budget too.
+                    st["attempts"] = 0
+                    _dispatch_shard(
+                        submission_id, submission_blob_url, sid, st,
+                    )
+                progress.publish(
+                    submission_id,
+                    progress.RUNNING,
+                    f"GPU capacity was tight — retrying {len(stragglers)} "
+                    f"straggler chunk(s) (round "
+                    f"{SHARD_DEADLINE_RETRY_ROUNDS - deadline_rounds_left} of "
+                    f"{SHARD_DEADLINE_RETRY_ROUNDS})…",
+                )
+                deadline = time.monotonic() + SHARD_POLL_DEADLINE_SECONDS
+                last_done = -1  # force a progress republish on the next sweep
+                continue
             for shard_id, st in shards.items():
                 if st["stage"] not in ("COMPLETED", "FAILED"):
                     st["stage"] = "FAILED"
                     st["message"] = (
                         f"Space-side poll deadline exceeded "
+                        f"({SHARD_POLL_DEADLINE_SECONDS}s) after "
+                        f"{SHARD_DEADLINE_RETRY_ROUNDS} retry round(s)"
                     )
             break
         time.sleep(JOB_POLL_INTERVAL_SECONDS)