mirror of
https://github.com/microsoft/fara.git
synced 2026-06-10 02:54:01 +08:00
other fixes to webeval
This commit is contained in:
10
README.md
10
README.md
@@ -40,7 +40,7 @@ Then you can iterative query it with:
|
||||
fara-cli --task "whats the weather in new york now"
|
||||
```
|
||||
|
||||
Hint: might need to do `--tensor-parallel-size 2` if you run out of memory
|
||||
Hint: you might need to add `--tensor-parallel-size 2` to the vllm command if you run out of memory
|
||||
|
||||
|
||||
### What Makes Fara-7B Unique
|
||||
@@ -280,10 +280,12 @@ Navigate to the scripts directory:
|
||||
cd webeval/scripts
|
||||
```
|
||||
|
||||
Make sure you set a valid OpenAI GPT-4o endpoint in `endpoint_configs_gpt4o/dev` in order to run the WebVoyager LLM-as-a-judge!
|
||||
|
||||
**Option 1: Self-hosted VLLM**
|
||||
|
||||
```bash
|
||||
python webvoyager.py --model_url ../../model_checkpoints/fara-7b/ --model_port 5000 --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /data/data/Fara/eval --device_id 0,1 --processes 1 --run_id 1 --max_rounds 100
|
||||
python webvoyager.py --model_url /path/where/you/want/to/download/model/ --model_port 5000 --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /path/to/save/eval/files --device_id 0,1 --processes 1 --run_id 1 --max_rounds 100
|
||||
```
|
||||
|
||||
**Option 2: Azure Foundry Deployment**
|
||||
@@ -291,14 +293,16 @@ python webvoyager.py --model_url ../../model_checkpoints/fara-7b/ --model_port 5
|
||||
Deploy [Fara-7B on Foundry endpoint(s)](https://ai.azure.com/explore/models/Fara-7B/version/2/registry/azureml-msr), then place endpoint URLs and keys in JSONs under `endpoint_configs/`:
|
||||
|
||||
```bash
|
||||
python webvoyager.py --model_endpoint ../../endpoint_configs/ --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /data/data/Fara/eval --processes 1 --run_id 1_endpoint --max_rounds 100
|
||||
python webvoyager.py --model_endpoint ../../endpoint_configs/ --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /path/to/save/eval/files --processes 1 --run_id 1_endpoint --max_rounds 100
|
||||
```
|
||||
|
||||
### Notes
|
||||
|
||||
|
||||
- We use the same LLM-as-a-judge prompts and model (GPT-4o) as WebVoyager, hence the `--eval_oai_config` argument
|
||||
- Set `--browserbase` for browser session management (requires exported API key and project ID environment variables)
|
||||
- Avoid overloading a single VLLM deployment with more than ~10 concurrent processes due to known issues
|
||||
- See debugging output in `fara/webeval/scripts/stdout.txt`
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -8,8 +8,9 @@ import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import logging
|
||||
|
||||
from vllm_facade import VLLM, Status
|
||||
from .vllm_facade import VLLM, Status
|
||||
|
||||
try:
|
||||
from aztool.azcp import AzFolder, LocalFolder
|
||||
@@ -17,11 +18,49 @@ except ImportError: # keep old behaviour when aztool missing
|
||||
AzFolder = None
|
||||
LocalFolder = None
|
||||
|
||||
try:
|
||||
from huggingface_hub import snapshot_download
|
||||
except ImportError:
|
||||
snapshot_download = None
|
||||
|
||||
# Hardcoded HuggingFace model ID for automatic download
|
||||
DEFAULT_HF_MODEL_ID = "microsoft/Fara-7B"
|
||||
|
||||
|
||||
def _is_azure_blob_url(model_path: str) -> bool:
|
||||
return model_path.startswith(("https://", "http://")) and "blob.core.windows.net" in model_path
|
||||
|
||||
|
||||
def _download_model_from_hf(output_dir: Path, model_id: str = DEFAULT_HF_MODEL_ID) -> str:
    """Download a model snapshot from the HuggingFace Hub into ``output_dir``.

    ``snapshot_download`` is always invoked; this function does not itself
    check whether the model files are already present.

    Args:
        output_dir: Target directory; created (including parents) if missing.
        model_id: HuggingFace repository id to download.

    Returns:
        The resolved absolute path of ``output_dir`` as a string.

    Raises:
        ImportError: If ``huggingface_hub`` could not be imported.
        Exception: Any error raised by ``snapshot_download`` is logged with
            authentication hints and re-raised unchanged.
    """
    if snapshot_download is None:
        raise ImportError(
            "huggingface_hub is required for automatic model download. "
            "Install it with: pip install huggingface_hub"
        )

    # Remember whether this call creates the directory so that a failed
    # download does not leave an empty directory behind.
    created_here = not output_dir.exists()
    output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Downloading %s from HuggingFace to %s", model_id, output_dir)
    logging.info("This may take a while depending on your internet connection...")

    try:
        snapshot_download(
            repo_id=model_id,
            local_dir=str(output_dir),
            # NOTE(review): recent huggingface_hub releases may deprecate this
            # argument -- confirm against the pinned version before removing.
            local_dir_use_symlinks=False,
        )
    except Exception as e:
        logging.error("Error downloading model: %s", e)
        logging.error("If you're getting authentication errors, you may need to:")
        logging.error("  1. Install huggingface-cli: pip install -U huggingface_hub")
        logging.error("  2. Login: huggingface-cli login")
        logging.error("  3. Or set HF_TOKEN environment variable")
        if created_here:
            # A caller that only downloads when the path is missing would
            # otherwise mistake the empty directory for a model on retry.
            # rmdir() refuses to delete a non-empty directory, so partially
            # downloaded files are kept.
            try:
                output_dir.rmdir()
            except OSError:
                pass
        raise

    logging.info("Successfully downloaded model to %s", output_dir)
    return str(output_dir.resolve())
|
||||
|
||||
|
||||
def _extract_model_name(model_url: str) -> str:
|
||||
"""Extract model name from URL for consistent naming."""
|
||||
url_parts = model_url.rstrip('/').split('/')
|
||||
@@ -100,8 +139,12 @@ class AzVllm:
|
||||
# It's a local directory
|
||||
model_path = Path(model_url).expanduser()
|
||||
if not model_path.exists():
|
||||
raise FileNotFoundError(f"Local model directory not found: {model_url}")
|
||||
self.local_model_path = str(model_path.resolve())
|
||||
# Auto-download from HuggingFace if path doesn't exist
|
||||
logging.warning(f"Local model directory not found: {model_url}")
|
||||
logging.info(f"Attempting to download {DEFAULT_HF_MODEL_ID} from HuggingFace...")
|
||||
self.local_model_path = _download_model_from_hf(model_path)
|
||||
else:
|
||||
self.local_model_path = str(model_path.resolve())
|
||||
self.port = port
|
||||
|
||||
def __enter__(self):
|
||||
|
||||
@@ -6,7 +6,7 @@ import os
|
||||
import logging
|
||||
# from aztool.workspace import Workspace, AIF_WORKSPACE
|
||||
import mlflow
|
||||
from eval_exp import EvalExp, ModelReference, get_default_vllm_model_config, get_foundry_endpoint_configs
|
||||
from eval_exp import EvalExp, ModelReference, get_foundry_endpoint_configs
|
||||
from webeval.oai_clients.graceful_client import GracefulRetryClient
|
||||
from webeval.eval_result import EvalResult, Stage
|
||||
from arg_parsing import get_eval_args
|
||||
@@ -62,7 +62,12 @@ def main():
|
||||
mlflow.log_param('using_external_endpoint', True)
|
||||
mlflow.log_param('endpoint_config_path', ','.join([x['base_url'] for x in websurfer_client_cfg]))
|
||||
else:
|
||||
websurfer_client_cfg = get_default_vllm_model_config(args.model_port)
|
||||
# For local VLLM, use a simple flat config structure that FaraAgent expects
|
||||
websurfer_client_cfg = {
|
||||
"api_key": "NONE",
|
||||
"model": "gpt-4o-mini-2024-07-18",
|
||||
"base_url": f"http://0.0.0.0:{args.model_port}/v1"
|
||||
}
|
||||
if args.web_surfer_client_cfg is not None:
|
||||
websurfer_client_cfg = args.web_surfer_client_cfg
|
||||
if args.web_surfer_kwargs:
|
||||
|
||||
@@ -74,7 +74,52 @@ def remap_action_names(action_name: str) -> str:
|
||||
return 'terminate'
|
||||
else:
|
||||
return action_name # Return as is if no remapping is needed
|
||||
|
||||
|
||||
def parse_text_based_event(event: Dict) -> Dict | None:
|
||||
"""
|
||||
Parse events where thoughts and actions are embedded as text in the message field.
|
||||
|
||||
Expected format:
|
||||
"Thought #X: <thought text>
|
||||
Action #X: executing tool '<tool_name>' with arguments {<json>}"
|
||||
|
||||
Returns a dict with 'action' and 'arguments' fields, or None if not a thought/action event.
|
||||
"""
|
||||
import re
|
||||
|
||||
message = event.get('message', '')
|
||||
|
||||
# Check if this is a thought/action message
|
||||
if 'Thought #' not in message or 'Action #' not in message:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Extract thought
|
||||
thought_match = re.search(r'Thought #\d+:\s*(.+?)(?=\nAction #)', message, re.DOTALL)
|
||||
thought = thought_match.group(1).strip() if thought_match else ""
|
||||
|
||||
# Extract action arguments JSON
|
||||
# Pattern: executing tool '<tool_name>' with arguments {json}
|
||||
action_match = re.search(r'with arguments\s+(\{.+\})', message, re.DOTALL)
|
||||
if not action_match:
|
||||
return None
|
||||
|
||||
arguments = json.loads(action_match.group(1))
|
||||
|
||||
# Add thoughts to arguments
|
||||
arguments['thoughts'] = thought
|
||||
|
||||
# Get action name from arguments
|
||||
action_name = arguments.get('action', 'unknown')
|
||||
|
||||
return {
|
||||
'action': action_name,
|
||||
'arguments': arguments
|
||||
}
|
||||
except (json.JSONDecodeError, AttributeError):
|
||||
# Failed to parse - return None
|
||||
return None
|
||||
|
||||
class Trajectory:
|
||||
def __init__(self, path, gpt_solver = False, skip_web_surfer_log = False):
|
||||
self.path = Path(path)
|
||||
@@ -102,9 +147,21 @@ class Trajectory:
|
||||
metadata = json.load(f)
|
||||
self.is_action = metadata.get("is_action", False)
|
||||
|
||||
# Check if events are text-based (no 'action' field in any event)
|
||||
has_structured_events = any(e.get('action') is not None for e in self.events)
|
||||
|
||||
if not has_structured_events and self.events:
|
||||
# Parse text-based events
|
||||
parsed_events = []
|
||||
for event in self.events:
|
||||
parsed = parse_text_based_event(event)
|
||||
if parsed:
|
||||
parsed_events.append(parsed)
|
||||
self.events = parsed_events
|
||||
|
||||
if gpt_solver:
|
||||
# remove non-WebSurfer events e.g. WebSurfer-SummarizedAction and other miscellaneous comments from solving pipeline
|
||||
self.events = [e for e in self.events if (e.get('source', None) == "WebSurfer" and e.get('action', None) is not None)]
|
||||
self.events = [e for e in self.events if (e.get('source', None) == "WebSurfer" and e.get('action', None) is not None)]
|
||||
# For gpt_solver, normalize events to have action in arguments for compatibility
|
||||
for event in self.events:
|
||||
if event.get('action') and 'arguments' in event and isinstance(event['arguments'], dict):
|
||||
|
||||
Reference in New Issue
Block a user