other fixes to webeval

2026-06-10 02:54:01 +08:00 · 2025-11-25 20:47:34 -08:00
parent 3fa15aa99a
commit f71083c9bd
4 changed files with 119 additions and 10 deletions
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ Then you can iterative query it with:
 fara-cli --task "whats the weather in new york now"
 ```

-Hint: might need to do `--tensor-parallel-size 2` if you run out of memory
+Hint: you might need to add `--tensor-parallel-size 2` to the vllm command if you run out of memory


 ### What Makes Fara-7B Unique
@@ -280,10 +280,12 @@ Navigate to the scripts directory:
 cd webeval/scripts
 ```

+Make sure you set a valid OpenAI GPT-4o endpoint in `endpoint_configs_gpt4o/dev`; it is required to run the WebVoyager LLM-as-a-judge!
+
 **Option 1: Self-hosted VLLM**

 ```bash
-python webvoyager.py --model_url ../../model_checkpoints/fara-7b/ --model_port 5000 --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /data/data/Fara/eval --device_id 0,1 --processes 1 --run_id 1 --max_rounds 100
+python webvoyager.py --model_url /path/where/you/want/to/download/model/ --model_port 5000 --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /path/to/save/eval/files --device_id 0,1 --processes 1 --run_id 1 --max_rounds 100
 ```

 **Option 2: Azure Foundry Deployment**
@@ -291,14 +293,16 @@ python webvoyager.py --model_url ../../model_checkpoints/fara-7b/ --model_port 5
 Deploy [Fara-7B on Foundry endpoint(s)](https://ai.azure.com/explore/models/Fara-7B/version/2/registry/azureml-msr), then place endpoint URLs and keys in JSONs under `endpoint_configs/`:

 ```bash
-python webvoyager.py --model_endpoint ../../endpoint_configs/ --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /data/data/Fara/eval --processes 1 --run_id 1_endpoint --max_rounds 100
+python webvoyager.py --model_endpoint ../../endpoint_configs/ --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /path/to/save/eval/files --processes 1 --run_id 1_endpoint --max_rounds 100
 ```

 ### Notes

+
 - We use the same LLM-as-a-judge prompts and model (GPT-4o) as WebVoyager, hence the `--eval_oai_config` argument
 - Set `--browserbase` for browser session management (requires exported API key and project ID environment variables)
 - Avoid overloading a single VLLM deployment with more than ~10 concurrent processes due to known issues
+- See debugging output in `fara/webeval/scripts/stdout.txt`

 ---

--- a/src/fara/vllm/az_vllm.py
+++ b/src/fara/vllm/az_vllm.py
@@ -8,8 +8,9 @@ import shutil
 import subprocess
 from pathlib import Path
 from typing import Optional
+import logging

-from vllm_facade import VLLM, Status
+from .vllm_facade import VLLM, Status

 try:
    from aztool.azcp import AzFolder, LocalFolder
@@ -17,11 +18,49 @@ except ImportError:  # keep old behaviour when aztool missing
    AzFolder = None
    LocalFolder = None

+try:
+    from huggingface_hub import snapshot_download
+except ImportError:
+    snapshot_download = None
+
+# Hardcoded HuggingFace model ID for automatic download
+DEFAULT_HF_MODEL_ID = "microsoft/Fara-7B"
+

 def _is_azure_blob_url(model_path: str) -> bool:
    return model_path.startswith(("https://", "http://")) and "blob.core.windows.net" in model_path


+def _download_model_from_hf(output_dir: Path, model_id: str = DEFAULT_HF_MODEL_ID) -> str:
+    """Download `model_id` from HuggingFace Hub into `output_dir` and return the resolved directory path."""
+    if snapshot_download is None:  # huggingface_hub failed to import at module load
+        raise ImportError(
+            "huggingface_hub is required for automatic model download. "
+            "Install it with: pip install huggingface_hub"
+        )
+
+    output_dir.mkdir(parents=True, exist_ok=True)  # create the target directory (and parents) if missing
+
+    logging.info(f"Downloading {model_id} from HuggingFace to {output_dir}")
+    logging.info("This may take a while depending on your internet connection...")
+
+    try:
+        snapshot_download(
+            repo_id=model_id,
+            local_dir=str(output_dir),
+            local_dir_use_symlinks=False,  # NOTE(review): reportedly deprecated in newer huggingface_hub — confirm against the pinned version
+        )
+        logging.info(f"Successfully downloaded model to {output_dir}")
+        return str(output_dir.resolve())
+    except Exception as e:
+        logging.error(f"Error downloading model: {e}")
+        logging.error("If you're getting authentication errors, you may need to:")
+        logging.error("  1. Install huggingface-cli: pip install -U huggingface_hub")
+        logging.error("  2. Login: huggingface-cli login")
+        logging.error("  3. Or set HF_TOKEN environment variable")
+        raise  # re-raise the original exception after logging the troubleshooting hints
+
+
 def _extract_model_name(model_url: str) -> str:
    """Extract model name from URL for consistent naming."""
    url_parts = model_url.rstrip('/').split('/')
@@ -100,8 +139,12 @@ class AzVllm:
                # It's a local directory
                model_path = Path(model_url).expanduser()
                if not model_path.exists():
-                    raise FileNotFoundError(f"Local model directory not found: {model_url}")
-                self.local_model_path = str(model_path.resolve())
+                    # Auto-download from HuggingFace if path doesn't exist
+                    logging.warning(f"Local model directory not found: {model_url}")
+                    logging.info(f"Attempting to download {DEFAULT_HF_MODEL_ID} from HuggingFace...")
+                    self.local_model_path = _download_model_from_hf(model_path)
+                else:
+                    self.local_model_path = str(model_path.resolve())
            self.port = port

    def __enter__(self):
--- a/webeval/scripts/webvoyager.py
+++ b/webeval/scripts/webvoyager.py
@@ -6,7 +6,7 @@ import os
 import logging
 # from aztool.workspace import Workspace, AIF_WORKSPACE
 import mlflow
-from eval_exp import EvalExp, ModelReference, get_default_vllm_model_config, get_foundry_endpoint_configs
+from eval_exp import EvalExp, ModelReference, get_foundry_endpoint_configs
 from webeval.oai_clients.graceful_client import GracefulRetryClient
 from webeval.eval_result import EvalResult, Stage
 from arg_parsing import get_eval_args
@@ -62,7 +62,12 @@ def main():
            mlflow.log_param('using_external_endpoint', True)
            mlflow.log_param('endpoint_config_path', ','.join([x['base_url'] for x in websurfer_client_cfg]))
        else:
-            websurfer_client_cfg = get_default_vllm_model_config(args.model_port)
+            # For local VLLM, use a simple flat config structure that FaraAgent expects
+            websurfer_client_cfg = {
+                "api_key": "NONE",
+                "model": "gpt-4o-mini-2024-07-18",
+                "base_url": f"http://0.0.0.0:{args.model_port}/v1"
+            }
            if args.web_surfer_client_cfg is not None:
                websurfer_client_cfg = args.web_surfer_client_cfg        
        if args.web_surfer_kwargs:
--- a/webeval/src/webeval/trajectory.py
+++ b/webeval/src/webeval/trajectory.py
@@ -74,7 +74,52 @@ def remap_action_names(action_name: str) -> str:
        return 'terminate'
    else:
        return action_name  # Return as is if no remapping is needed
-    
+
+def parse_text_based_event(event: Dict) -> Dict | None:
+    """
+    Parse events where thoughts and actions are embedded as text in the message field.
+
+    Expected format:
+    "Thought #X: <thought text>
+    Action #X: executing tool '<tool_name>' with arguments {<json>}"
+
+    Returns a dict with 'action' and 'arguments' fields, or None if not a thought/action event (including a non-string message).
+    """
+    import re
+
+    message = event.get('message', '')
+
+    # Only string messages can be thought/action text; None or numbers would raise TypeError on `in`
+    if not isinstance(message, str) or 'Thought #' not in message or 'Action #' not in message:
+        return None
+
+    try:
+        # Extract thought (everything between "Thought #N:" and the following "Action #" line)
+        thought_match = re.search(r'Thought #\d+:\s*(.+?)(?=\nAction #)', message, re.DOTALL)
+        thought = thought_match.group(1).strip() if thought_match else ""
+
+        # Extract action arguments JSON
+        # Pattern: executing tool '<tool_name>' with arguments {json}
+        action_match = re.search(r'with arguments\s+(\{.+\})', message, re.DOTALL)
+        if not action_match:
+            return None
+
+        arguments = json.loads(action_match.group(1))
+
+        # Add thoughts to arguments
+        arguments['thoughts'] = thought
+
+        # Get action name from arguments ('unknown' when the JSON has no 'action' key)
+        action_name = arguments.get('action', 'unknown')
+
+        return {
+            'action': action_name,
+            'arguments': arguments
+        }
+    except (json.JSONDecodeError, AttributeError):
+        # Failed to parse - return None
+        return None
+
 class Trajectory:
    def __init__(self, path, gpt_solver = False, skip_web_surfer_log = False):
        self.path = Path(path)
@@ -102,9 +147,21 @@ class Trajectory:
                metadata = json.load(f)
                self.is_action = metadata.get("is_action", False)
        
+        # Check if events are text-based (no 'action' field in any event)
+        has_structured_events = any(e.get('action') is not None for e in self.events)
+
+        if not has_structured_events and self.events:
+            # Parse text-based events
+            parsed_events = []
+            for event in self.events:
+                parsed = parse_text_based_event(event)
+                if parsed:
+                    parsed_events.append(parsed)
+            self.events = parsed_events
+
        if gpt_solver:
            #  remove non-WebSurfer events e.g. WebSurfer-SummarizedAction and other miscellaneous comments from solving pipeline
-            self.events = [e for e in self.events if (e.get('source', None) == "WebSurfer" and e.get('action', None) is not None)] 
+            self.events = [e for e in self.events if (e.get('source', None) == "WebSurfer" and e.get('action', None) is not None)]
            # For gpt_solver, normalize events to have action in arguments for compatibility
            for event in self.events:
                if event.get('action') and 'arguments' in event and isinstance(event['arguments'], dict):