other fixes to webeval

This commit is contained in:
corby
2025-11-25 20:47:34 -08:00
parent 3fa15aa99a
commit f71083c9bd
4 changed files with 119 additions and 10 deletions

View File

@@ -40,7 +40,7 @@ Then you can iterative query it with:
fara-cli --task "whats the weather in new york now"
```
Hint: might need to do `--tensor-parallel-size 2` if you run out of memory
Hint: you may need to add `--tensor-parallel-size 2` to the vllm command if you run out of memory
### What Makes Fara-7B Unique
@@ -280,10 +280,12 @@ Navigate to the scripts directory:
cd webeval/scripts
```
Make sure you set a valid OpenAI GPT-4o endpoint in `endpoint_configs_gpt4o/dev` in order to run the WebVoyager LLM-as-a-judge!
**Option 1: Self-hosted VLLM**
```bash
python webvoyager.py --model_url ../../model_checkpoints/fara-7b/ --model_port 5000 --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /data/data/Fara/eval --device_id 0,1 --processes 1 --run_id 1 --max_rounds 100
python webvoyager.py --model_url /path/where/you/want/to/download/model/ --model_port 5000 --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /path/to/save/eval/files --device_id 0,1 --processes 1 --run_id 1 --max_rounds 100
```
**Option 2: Azure Foundry Deployment**
@@ -291,14 +293,16 @@ python webvoyager.py --model_url ../../model_checkpoints/fara-7b/ --model_port 5
Deploy [Fara-7B on Foundry endpoint(s)](https://ai.azure.com/explore/models/Fara-7B/version/2/registry/azureml-msr), then place endpoint URLs and keys in JSONs under `endpoint_configs/`:
```bash
python webvoyager.py --model_endpoint ../../endpoint_configs/ --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /data/data/Fara/eval --processes 1 --run_id 1_endpoint --max_rounds 100
python webvoyager.py --model_endpoint ../../endpoint_configs/ --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /path/to/save/eval/files --processes 1 --run_id 1_endpoint --max_rounds 100
```
### Notes
- We use the same LLM-as-a-judge prompts and model (GPT-4o) as WebVoyager, hence the `--eval_oai_config` argument
- Set `--browserbase` for browser session management (requires exported API key and project ID environment variables)
- Avoid overloading a single VLLM deployment with more than ~10 concurrent processes due to known issues
- See debugging output in `fara/webeval/scripts/stdout.txt`
---

View File

@@ -8,8 +8,9 @@ import shutil
import subprocess
from pathlib import Path
from typing import Optional
import logging
from vllm_facade import VLLM, Status
from .vllm_facade import VLLM, Status
try:
from aztool.azcp import AzFolder, LocalFolder
@@ -17,11 +18,49 @@ except ImportError: # keep old behaviour when aztool missing
AzFolder = None
LocalFolder = None
try:
from huggingface_hub import snapshot_download
except ImportError:
snapshot_download = None
# Hardcoded HuggingFace model ID for automatic download
DEFAULT_HF_MODEL_ID = "microsoft/Fara-7B"
def _is_azure_blob_url(model_path: str) -> bool:
return model_path.startswith(("https://", "http://")) and "blob.core.windows.net" in model_path
def _download_model_from_hf(output_dir: Path, model_id: str = DEFAULT_HF_MODEL_ID) -> str:
    """Fetch a HuggingFace Hub snapshot of *model_id* into *output_dir*.

    The directory is created when missing and ``snapshot_download`` is always
    invoked; whether files already on disk are reused is left to
    huggingface_hub.

    Args:
        output_dir: Directory that receives the model files.
        model_id: Hub repository id to download.

    Returns:
        The resolved path of *output_dir* as a string.

    Raises:
        ImportError: If ``snapshot_download`` is unavailable (huggingface_hub
            could not be imported).
        Exception: Any download error is logged, followed by authentication
            hints, and then re-raised unchanged.
    """
    if snapshot_download is None:
        raise ImportError(
            "huggingface_hub is required for automatic model download. "
            "Install it with: pip install huggingface_hub"
        )

    output_dir.mkdir(parents=True, exist_ok=True)

    logging.info(f"Downloading {model_id} from HuggingFace to {output_dir}")
    logging.info("This may take a while depending on your internet connection...")

    download_kwargs = {
        "repo_id": model_id,
        "local_dir": str(output_dir),
        "local_dir_use_symlinks": False,
    }
    try:
        snapshot_download(**download_kwargs)
        logging.info(f"Successfully downloaded model to {output_dir}")
        resolved_dir = output_dir.resolve()
        return str(resolved_dir)
    except Exception as e:
        logging.error(f"Error downloading model: {e}")
        # Authentication is a likely cause of failure, so spell out the remedies.
        auth_hints = (
            "If you're getting authentication errors, you may need to:",
            "  1. Install huggingface-cli: pip install -U huggingface_hub",
            "  2. Login: huggingface-cli login",
            "  3. Or set HF_TOKEN environment variable",
        )
        for hint in auth_hints:
            logging.error(hint)
        raise
def _extract_model_name(model_url: str) -> str:
"""Extract model name from URL for consistent naming."""
url_parts = model_url.rstrip('/').split('/')
@@ -100,8 +139,12 @@ class AzVllm:
# It's a local directory
model_path = Path(model_url).expanduser()
if not model_path.exists():
raise FileNotFoundError(f"Local model directory not found: {model_url}")
self.local_model_path = str(model_path.resolve())
# Auto-download from HuggingFace if path doesn't exist
logging.warning(f"Local model directory not found: {model_url}")
logging.info(f"Attempting to download {DEFAULT_HF_MODEL_ID} from HuggingFace...")
self.local_model_path = _download_model_from_hf(model_path)
else:
self.local_model_path = str(model_path.resolve())
self.port = port
def __enter__(self):

View File

@@ -6,7 +6,7 @@ import os
import logging
# from aztool.workspace import Workspace, AIF_WORKSPACE
import mlflow
from eval_exp import EvalExp, ModelReference, get_default_vllm_model_config, get_foundry_endpoint_configs
from eval_exp import EvalExp, ModelReference, get_foundry_endpoint_configs
from webeval.oai_clients.graceful_client import GracefulRetryClient
from webeval.eval_result import EvalResult, Stage
from arg_parsing import get_eval_args
@@ -62,7 +62,12 @@ def main():
mlflow.log_param('using_external_endpoint', True)
mlflow.log_param('endpoint_config_path', ','.join([x['base_url'] for x in websurfer_client_cfg]))
else:
websurfer_client_cfg = get_default_vllm_model_config(args.model_port)
# For local VLLM, use a simple flat config structure that FaraAgent expects
websurfer_client_cfg = {
"api_key": "NONE",
"model": "gpt-4o-mini-2024-07-18",
"base_url": f"http://0.0.0.0:{args.model_port}/v1"
}
if args.web_surfer_client_cfg is not None:
websurfer_client_cfg = args.web_surfer_client_cfg
if args.web_surfer_kwargs:

View File

@@ -74,7 +74,52 @@ def remap_action_names(action_name: str) -> str:
return 'terminate'
else:
return action_name # Return as is if no remapping is needed
def parse_text_based_event(event: Dict) -> Dict | None:
"""
Parse events where thoughts and actions are embedded as text in the message field.
Expected format:
"Thought #X: <thought text>
Action #X: executing tool '<tool_name>' with arguments {<json>}"
Returns a dict with 'action' and 'arguments' fields, or None if not a thought/action event.
"""
import re
message = event.get('message', '')
# Check if this is a thought/action message
if 'Thought #' not in message or 'Action #' not in message:
return None
try:
# Extract thought
thought_match = re.search(r'Thought #\d+:\s*(.+?)(?=\nAction #)', message, re.DOTALL)
thought = thought_match.group(1).strip() if thought_match else ""
# Extract action arguments JSON
# Pattern: executing tool '<tool_name>' with arguments {json}
action_match = re.search(r'with arguments\s+(\{.+\})', message, re.DOTALL)
if not action_match:
return None
arguments = json.loads(action_match.group(1))
# Add thoughts to arguments
arguments['thoughts'] = thought
# Get action name from arguments
action_name = arguments.get('action', 'unknown')
return {
'action': action_name,
'arguments': arguments
}
except (json.JSONDecodeError, AttributeError):
# Failed to parse - return None
return None
class Trajectory:
def __init__(self, path, gpt_solver = False, skip_web_surfer_log = False):
self.path = Path(path)
@@ -102,9 +147,21 @@ class Trajectory:
metadata = json.load(f)
self.is_action = metadata.get("is_action", False)
# Check if events are text-based (no 'action' field in any event)
has_structured_events = any(e.get('action') is not None for e in self.events)
if not has_structured_events and self.events:
# Parse text-based events
parsed_events = []
for event in self.events:
parsed = parse_text_based_event(event)
if parsed:
parsed_events.append(parsed)
self.events = parsed_events
if gpt_solver:
# remove non-WebSurfer events e.g. WebSurfer-SummarizedAction and other miscellaneous comments from solving pipeline
self.events = [e for e in self.events if (e.get('source', None) == "WebSurfer" and e.get('action', None) is not None)]
self.events = [e for e in self.events if (e.get('source', None) == "WebSurfer" and e.get('action', None) is not None)]
# For gpt_solver, normalize events to have action in arguments for compatibility
for event in self.events:
if event.get('action') and 'arguments' in event and isinstance(event['arguments'], dict):