online mind2web works

2026-06-10 02:54:01 +08:00 · 2025-11-25 22:47:53 -08:00
parent f71083c9bd
commit 1d6d1d64a6
30 changed files with 3718 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -286,6 +286,8 @@ Make sure you set a valid OpenAI GPT-4o endpoint in `endpoint_configs_gpt4o/dev`

 ```bash
 python webvoyager.py --model_url /path/where/you/want/to/download/model/ --model_port 5000 --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /path/to/save/eval/files --device_id 0,1 --processes 1 --run_id 1 --max_rounds 100
+python om2w.py --model_url /path/where/you/want/to/download/model/ --model_port 5000 --eval_oai_config ../endpoint_configs_o4/dev/ --eval_model o4-mini --out_url /path/to/save/eval/files --device_id 0,1 --processes 1 --run_id 1 --max_rounds 100
+
 ```

 **Option 2: Azure Foundry Deployment**
@@ -294,6 +296,7 @@ Deploy [Fara-7B on Foundry endpoint(s)](https://ai.azure.com/explore/models/Fara

 ```bash
 python webvoyager.py --model_endpoint ../../endpoint_configs/ --eval_oai_config ../endpoint_configs_gpt4o/dev/ --out_url /path/to/save/eval/files --processes 1 --run_id 1_endpoint --max_rounds 100
+python om2w.py --model_endpoint ../../endpoint_configs/ --eval_oai_config ../endpoint_configs_o4/dev/ --eval_model o4-mini --out_url /path/to/save/eval/files --processes 1 --run_id 1_endpoint --max_rounds 100
 ```

 ### Notes
@@ -324,10 +327,12 @@ Example path:
 Each evaluation folder contains:
 - `gpt_eval/` - LLM-as-a-judge evaluation results
 - `traj/` - Per-task trajectory subdirectories containing:
-  - `final_answer.json` (e.g., `Amazon--1_final_answer.json`) - `<no_answer>` indicates abortion or step budget exceeded
-  - `scores/gpt_eval.json` - LLM judge scores
+  - `*_final_answer.json` (e.g., `Amazon--1_final_answer.json`) - `<no_answer>` indicates abortion or step budget exceeded
+  - `scores/*_eval.json` - LLM judge scores (`gpt_eval.json` for WebVoyager, `WebJudge_Online_Mind2Web_eval-3.json` for Online-Mind2Web)
  - `web_surfer.log` - Action history and errors
  - `screenshot_X.png` - Screenshots captured before each action X
+  - `times.json` - contains the start and end times of the task
+  - `core.log` - contains high-level logs, such as whether the trajectory needs to be started or is already cached/done, the evaluation score, the duration, and any errors encountered

 ### Running Analysis

@@ -357,6 +362,18 @@ To re-run failed tasks, execute the evaluation script again with the same `run_i

 </details>

+<details>
+<summary>Example of web_surfer.log (action history)</summary>
+
+```json
+{"message": "\nThought #1: To begin locating the closest GameStop store, I will click on the 'Trade-In' button, which typically leads to a store locator feature on retail websites.\nAction #1: executing tool 'left_click' with arguments {\"action\": \"left_click\", \"coordinate\": [1074, 37]}", "timestamp": "2025-11-25T22:23:42.788320", "type": "OtherEvent"}
+{"message": "Observation#1: I clicked at coordinates (1083.0252100840337, 37.16517857142858).", "timestamp": "2025-11-25T22:23:47.616613", "type": "OtherEvent"}
+{"message": "\nThought #2: Since access to Gamestop's website has been blocked, I will search for alternative sources of store location and hours for GameStop locations near zip code 90028. This could include cached or third-party website information.\nAction #2: executing tool 'web_search' with arguments {\"action\": \"web_search\", \"query\": \"GameStop store locator zip code 90028\"}", "timestamp": "2025-11-25T22:23:50.579702", "type": "OtherEvent"}
+{"message": "Observation#2: I typed 'GameStop store locator zip code 90028' into the browser search bar.", "timestamp": "2025-11-25T22:23:55.119495", "type": "OtherEvent"}
+{"message": "\nThought #3: The closest GameStop store to zip code 90028 is located at 5533 W Sunset Blvd, Los Angeles, CA 90028. It is currently closed and will open tomorrow at 11 AM. The contact number is (323) 466-3714.\nAction #3: executing tool 'terminate' with arguments {\"action\": \"terminate\", \"status\": \"success\"}", "timestamp": "2025-11-25T22:23:58.729038", "type": "OtherEvent"}
+{"message": "Observation#3: The closest GameStop store to zip code 90028 is located at 5533 W Sunset Blvd, Los Angeles, CA 90028. It is currently closed and will open tomorrow at 11 AM. The contact number is (323) 466-3714.", "timestamp": "2025-11-25T22:24:02.379069", "type": "OtherEvent"}
+```
+
 ---

 ## Citation
--- a/webeval/data/om2w/Online_Mind2Web_06042025.json
+++ b/webeval/data/om2w/Online_Mind2Web_06042025.json
--- a/webeval/endpoint_configs_o4/dev/o4mini-0-config.json
+++ b/webeval/endpoint_configs_o4/dev/o4mini-0-config.json
@@ -0,0 +1,16 @@
+{
+  "CHAT_COMPLETION_PROVIDER": "azure",
+  "CHAT_COMPLETION_KWARGS_JSON": {
+    "api_version": "2025-04-01-preview",
+    "azure_endpoint": "https://<your-endpoint>.openai.azure.com/",
+    "model_capabilities": {
+      "function_calling": true,
+      "json_output": true,
+      "vision": true
+    },
+    "azure_deployment": "<your-deployment-name>",
+    "azure_ad_token_provider": "default",
+    "model": "o4-mini",
+    "max_retries": 5
+  }
+}
--- a/webeval/scripts/om2w.py
+++ b/webeval/scripts/om2w.py
@@ -0,0 +1,123 @@
+from webeval.systems.websurfer import WebSurferSystem
+from webeval.benchmarks import OnlineM2WBenchmark
+from pathlib import Path
+import numpy as np
+import os
+import logging
+# from aztool.workspace import Workspace, AIF_WORKSPACE
+import mlflow
+from eval_exp import EvalExp, ModelReference, get_foundry_endpoint_configs
+from webeval.oai_clients.graceful_client import GracefulRetryClient
+from webeval.eval_result import EvalResult, Stage
+from arg_parsing import get_eval_args
+
+# Default value of the --eval_data_url option registered in add_om2w_args.
+# This is a relative path; presumably resolved against the working directory
+# the script is launched from — TODO confirm.
+DEFAULT_DATA_URL = '../data/om2w/Online_Mind2Web_06042025.json'
+
+# Per-result callback that accumulates the scores of evaluated results and
+# reports their running mean to MLflow under the 'score' metric.
+class Callback:
+    def __init__(self):
+        # Scores of every result seen so far whose stage was Stage.EVALUATED.
+        self.scores = []
+
+    def __call__(self, result: EvalResult, mlflow_facade, mlflow_run_id: str):
+        # Only results that reached the EVALUATED stage contribute a score.
+        if result.stage == Stage.EVALUATED:
+            self.scores.append(result.score)
+        # The metric is logged on every call, whatever the stage. `or [0]`
+        # makes it 0 until the first score arrives, instead of taking the
+        # mean of an empty list.
+        mlflow_facade.log_metric('score', np.mean(self.scores or [0]), run_id = mlflow_run_id)
+
+
+# Register the Online-Mind2Web specific command-line options on `parser`:
+# --eval_data_url, --split and --eval_method.
+def add_om2w_args(parser):
+    # NOTE(review): the help text says "Azure URI", but the default
+    # (DEFAULT_DATA_URL) is a relative file path — confirm which is intended.
+    parser.add_argument('--eval_data_url', type=str, default=DEFAULT_DATA_URL, help='Azure URI to the evaluation data')
+    # Default '*' presumably selects every split — TODO confirm against the benchmark.
+    parser.add_argument('--split', type=str, default='*')
+    parser.add_argument('--eval_method', type=str, default='WebJudge_Online_Mind2Web_eval', help='Evaluation method (default: WebJudge_Online_Mind2Web_eval, one of: AgentTrek_eval, Autonomous_eval, WebJudge_general_eval, WebJudge_Online_Mind2Web_eval)')
+
+
+# Script entry point: parse the CLI arguments, build the WebSurfer system and
+# the Online-Mind2Web benchmark, log the run parameters to MLflow, and hand
+# everything to EvalExp.run.
+def main():
+    # add_om2w_args contributes the om2w-specific options to the argument parser.
+    args = get_eval_args(add_om2w_args)
+    # NOTE(review): `assert` statements are removed under `python -O`, so this
+    # check and the browserbase checks below would be skipped in optimized mode.
+    assert args.eval_model == "o4-mini", "only o4-mini can be used for om2w eval"
+
+    # Browserbase needs both credentials in the environment.
+    if args.browserbase:
+        assert os.environ.get("BROWSERBASE_API_KEY"), "BROWSERBASE_API_KEY environment variable must be set to use browserbase"
+        assert os.environ.get("BROWSERBASE_PROJECT_ID"), "BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID environment variables must be set to use browserbase"
+
+    experiment = EvalExp(
+        ws = None,
+        user = args.user,
+        seed = args.seed)
+  
+    with experiment.start_run() as run:
+        # max_n_images falls back to 3 when it is not given in web_surfer_kwargs.
+        # NOTE(review): .get() is called on args.web_surfer_kwargs unconditionally
+        # here, while the `if args.web_surfer_kwargs:` check further down allows
+        # for it being empty/None — confirm it can never be None.
+        model_ref = ModelReference(args.model_url, args.model_port, args.device_id, args.web_surfer_kwargs.get('max_n_images', 3), args.gpt_solver_model_name, args.dtype, args.enforce_eager, use_external_endpoint=bool(args.model_endpoint))
+
+        logger = logging.getLogger('om2w-eval')
+        logger.setLevel(logging.INFO)
+
+        mlflow.log_param('max_rounds', args.max_rounds)
+        # For the gpt_solver type, the solver model name is appended to the logged value.
+        if args.web_surfer_model_type == "gpt_solver":
+            mlflow.log_param('web_surfer_model_type', args.web_surfer_model_type + "/" + args.gpt_solver_model_name)
+        else:
+            mlflow.log_param('web_surfer_model_type', args.web_surfer_model_type)
+        mlflow.log_param('fn_call_template', args.fn_call_template)
+
+        # If using external endpoint, load all endpoint configs into a list of dicts
+        if args.model_endpoint:
+            websurfer_client_cfg = get_foundry_endpoint_configs(args.model_endpoint)
+            logger.info(f"Loaded {len(websurfer_client_cfg)} external endpoint config(s) from {args.model_endpoint}")
+            model_ref.model_url_to_log = websurfer_client_cfg[0]['base_url']  # log the first endpoint URL as the model URL
+            # NOTE(review): model_to_log is also set to the base URL rather than
+            # a model name — confirm this is intended.
+            model_ref.model_to_log = websurfer_client_cfg[0]['base_url']
+            mlflow.log_param('using_external_endpoint', True)
+            # Despite the parameter name, the logged value is the comma-joined
+            # list of endpoint base URLs, not a filesystem path.
+            mlflow.log_param('endpoint_config_path', ','.join([x['base_url'] for x in websurfer_client_cfg]))
+        else:
+            # For local VLLM, use a simple flat config structure that FaraAgent expects
+            # (the "model" value is presumably a placeholder name for the local
+            # server — TODO confirm).
+            websurfer_client_cfg = {
+                "api_key": "NONE",
+                "model": "gpt-4o-mini-2024-07-18",
+                "base_url": f"http://0.0.0.0:{args.model_port}/v1"
+            }
+            # An explicitly supplied client config replaces the local default.
+            if args.web_surfer_client_cfg is not None:
+                websurfer_client_cfg = args.web_surfer_client_cfg
+        if args.web_surfer_kwargs:
+            mlflow.log_params({f'web_surfer_kwargs.{k}': v  for k, v in args.web_surfer_kwargs.items()})
+
+        # start_on_target_url is fixed to True for this benchmark script.
+        system = WebSurferSystem(
+            system_name="WebSurfer",
+            web_surfer_model_type = args.web_surfer_model_type,
+            max_rounds = args.max_rounds,
+            websurfer_client_cfg = websurfer_client_cfg,
+            start_on_target_url=True,
+            browserbase=args.browserbase,
+            web_surfer_kwargs=args.web_surfer_kwargs,
+            gpt_solver_model_name=args.gpt_solver_model_name,
+            fn_call_template=args.fn_call_template,
+            step_budgets=args.step_budgets,
+        )
+
+
+        mlflow.log_param("eval_data", args.eval_data_url)
+        mlflow.log_param("eval_model", args.eval_model)
+        mlflow.log_param("eval_method", args.eval_method)
+        # Benchmark data directory: <parent of this script's directory>/data/om2w,
+        # created if it does not exist yet.
+        data_dir = Path(__file__).resolve().parent.parent / "data" / "om2w"
+        data_dir.mkdir(parents=True, exist_ok=True)
+        # The judge client is built from the --eval_oai_config path.
+        benchmark = OnlineM2WBenchmark(
+            data_dir=data_dir,
+            eval_method = args.eval_method,
+            data_az_url = args.eval_data_url,
+            model_client = GracefulRetryClient.from_path(args.eval_oai_config, logger=logger, eval_model=args.eval_model))
+
+        mlflow.log_param('subsample', args.subsample)
+        mlflow.log_param('processes', args.processes)
+        mlflow.log_param('split', args.split)
+        mlflow.log_param('max_error_task_retries', args.max_error_task_retries)
+        # A fresh Callback instance tracks the running mean score for this run.
+        experiment.run(
+            model_ref = model_ref,
+            system = system,
+            benchmark = benchmark,
+            out_url = args.out_url,
+            subsample = args.subsample,
+            redo_eval = args.redo_eval,
+            run_id = args.run_id,
+            split = args.split,
+            processes = args.processes,
+            callbacks = [Callback()],
+            eval_only = args.eval_only,
+            max_error_task_retries = args.max_error_task_retries)
+
+
+# Run the evaluation only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/webeval/scripts/test_trajectory_loading.py
+++ b/webeval/scripts/test_trajectory_loading.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Test script to verify Trajectory class can load text-based log files.
+"""
+
+import sys
+from pathlib import Path
+
+# Add the src directory to path so that `webeval` can be imported below.
+# NOTE(review): this is a hard-coded absolute path under one user's directory,
+# so the script only works on a machine with that exact layout.
+sys.path.insert(0, '/mnt/users/wangzhe/code/fara/webeval/src')
+
+from webeval.trajectory import Trajectory
+
+# Manual smoke test: load one trajectory directory with Trajectory and print a
+# summary, the thought/action pairs, the final answer and any token usage.
+# Exits with status 1 if anything raises.
+def main():
+    # NOTE(review): hard-coded, machine-specific trajectory directory; the
+    # script reports whether it exists but still tries to load it either way.
+    traj_path = Path('/data/data/Fara/eval/runs/WebSurfer-fara-100-max_n_images-3/model_checkpoints/corbyrosset/WebVoyager_WebVoyager_data_08312025.jsonl/Nov252025/traj/Allrecipes--0/')
+
+    print("="*80)
+    print("Testing Trajectory Loading")
+    print("="*80)
+    print(f"\nTrajectory path: {traj_path}")
+    print(f"Path exists: {traj_path.exists()}\n")
+
+    try:
+        # Load the trajectory
+        print("Loading trajectory...")
+        traj = Trajectory(traj_path)
+
+        print("\n" + "="*80)
+        print("✓ Trajectory loaded successfully!")
+        print("="*80)
+
+        # Display summary
+        print(f"\nTrajectory Summary:")
+        print(f"  - Representation: {repr(traj)}")
+        print(f"  - Number of events: {len(traj.events)}")
+        print(f"  - Number of actions: {len(traj.actions)}")
+        print(f"  - Number of thoughts: {len(traj.thoughts)}")
+        print(f"  - Number of screenshots: {len(traj.screenshots)}")
+        print(f"  - Is aborted: {traj.is_aborted}")
+
+        # Display actions and thoughts
+        print("\n" + "="*80)
+        print("Actions and Thoughts")
+        print("="*80)
+
+        # zip() stops at the shorter of the two lists, so unmatched trailing
+        # thoughts or actions are not printed; numbering starts at 1.
+        for i, (thought, action) in enumerate(zip(traj.thoughts, traj.actions), 1):
+            print(f"\n{'─'*80}")
+            print(f"Action {i}:")
+            print(f"{'─'*80}")
+            print(f"Thought: {thought}")
+            print(f"Action:  {action}")
+
+        # Display answer information
+        print("\n" + "="*80)
+        print("Final Answer")
+        print("="*80)
+        print(f"Answer: {traj.answer.final_answer}")
+        print(f"Is aborted: {traj.answer.is_aborted}")
+
+        # Display token usage if available
+        if traj.answer.token_usage:
+            print("\n" + "="*80)
+            print("Token Usage")
+            print("="*80)
+            for key, usage in traj.answer.token_usage.items():
+                print(f"  {key}: {usage}")
+
+        print("\n" + "="*80)
+        print("✓ Test completed successfully!")
+        print("="*80)
+
+    # Deliberately broad: any failure while loading or printing is reported
+    # with its traceback and turned into a non-zero exit status.
+    except Exception as e:
+        print("\n" + "="*80)
+        print("✗ Failed to load trajectory!")
+        print("="*80)
+        print(f"Error: {type(e).__name__}: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
+# Run the smoke test only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/webeval/scripts/webvoyager.py
+++ b/webeval/scripts/webvoyager.py
@@ -12,7 +12,7 @@ from webeval.eval_result import EvalResult, Stage
 from arg_parsing import get_eval_args


-DEFAULT_DATA_URL = '../data/WebVoyager_data_08312025.jsonl'
+DEFAULT_DATA_URL = '../data/webvoyager/WebVoyager_data_08312025.jsonl'

 class Callback:
    def __init__(self):
--- a/webeval/src/webeval/benchmarks/init.py
+++ b/webeval/src/webeval/benchmarks/init.py
@@ -4,7 +4,7 @@
 from .webvoyager.webvoyager import WebVoyagerBenchmark
 # from .mdm2w.mdm2w import MultimodalM2WBenchmark
 # from .aurora.aurora import AuroraBenchmark
-# from .om2w.om2w import OnlineM2WBenchmark
+from .om2w.om2w import OnlineM2WBenchmark
 # from .holdout.holdout import HoldoutBenchmark
 # from .deepshop.deepshop import DeepShopBenchmark
 # from .browsecomp.browsecomp import BrowseCompBenchmark
@@ -23,7 +23,7 @@ __all__ = [
    "WebVoyagerBenchmark",
    # "MultimodalM2WBenchmark",
    # "AuroraBenchmark",
-    # "OnlineM2WBenchmark",
+    "OnlineM2WBenchmark",
    # "HoldoutBenchmark",
    # "DeepShopBenchmark",
    # "BrowseCompBenchmark",
--- a/webeval/src/webeval/benchmarks/om2w/init.py
+++ b/webeval/src/webeval/benchmarks/om2w/init.py
--- a/webeval/src/webeval/benchmarks/om2w/impl/.gitattributes
+++ b/webeval/src/webeval/benchmarks/om2w/impl/.gitattributes
--- a/webeval/src/webeval/benchmarks/om2w/impl/.gitignore
+++ b/webeval/src/webeval/benchmarks/om2w/impl/.gitignore
--- a/webeval/src/webeval/benchmarks/om2w/impl/README.md
+++ b/webeval/src/webeval/benchmarks/om2w/impl/README.md
@@ -0,0 +1,228 @@
+<h1 align="center"> Online-Mind2Web Benchmark </h1>
+
+<p align="center">
+  <a href="https://xuetianci.github.io/" style="text-decoration: none;">Tianci Xue<sup>,1</sup></a>, 
+  <a href="https://x.com/weijian_qi" style="text-decoration: none;">Weijian Qi<sup>*,1</sup></a>,
+  <a href="https://tnshi.com/" style="text-decoration: none;">Tianneng Shi<sup>*2</sup></a>,
+  <a href="https://chanh.ee/" style="text-decoration: none;">Chan Hee Song<sup>1</sup></a>,
+  <a href="https://boyugou.github.io/" style="text-decoration: none;">Boyu Gou<sup>1</sup></a>,
+  <a href="https://dawnsong.io/" style="text-decoration: none;">Dawn Song<sup>,2</sup></a>,
+  <a href="https://u.osu.edu/ihudas/people/" style="text-decoration: none;">Huan Sun<sup>†,1</sup></a>,
+  <a href="https://ysu1989.github.io/" style="text-decoration: none;">Yu Su<sup>†,1</sup></a>
+</p>
+
+<p align="center">
+  <sup>1</sup>The Ohio State University, <sup>2</sup>University of California, Berkeley <br>
+  <sub><sup>*</sup>Equal contribution, <sup>†</sup>Equal advising</sub>
+</p>
+
+<p align="center">
+<a href="https://arxiv.org/abs/2504.01382">📃 Paper</a>
+•
+<a href="https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4?pvs=4">📃 Blog</a>
+•
+<a href="https://huggingface.co/spaces/osunlp/Online_Mind2Web_Leaderboard" >🏆 Leaderboard</a>
+•
+<a href="https://huggingface.co/datasets/osunlp/Online-Mind2Web" >🤗 Data</a>
+</p>
+
+
+# Online-Mind2Web benchmark
+
+## News
+* [05/11/2025] Check out our updates in the [paper](https://arxiv.org/abs/2504.01382).
+  * The performance of Claude Computer Use 3.7.
+  * WebJudge(o4-mini) achieves high agreement (86%) with a low success rate gap (3.8%) compared with humans.
+  * Release [WebJudge-7B](https://huggingface.co/osunlp/WebJudge-7B), a robust and reliable reward model for Reinforcement learning.
+
+## Tasks
+Online-Mind2Web includes 300 diverse tasks from 136 popular websites across various domains. It covers a diverse set of real-world user tasks, such as clothing, food, housing, and transportation, to evaluate web agents' performance in a real-world online environment.
+
+## Update Tasks
+
+We will regularly update Online-Mind2Web by replacing outdated or invalid tasks (e.g., due to website changes) to maintain its value as a rigorous benchmark for web agents. If you find any tasks are outdated, please reach out to us, and we will update them.
+
+To ensure fair comparisons, we will aim to keep the updated tasks on the same websites as before and with a similar reference length. Additionally, once agent performance saturates on Online-Mind2Web, we will also revise simple tasks to preserve its long-term value.
+
+# Automatic Evaluator via LLM-as-a-Judge (WebJudge)
+To enhance the reliability and scalability of the evaluation process in online environments, we propose a more reliable automatic evaluation method called **WebJudge**, which consists of three parts. (1) Key Point Identification: The model is prompted to identify several key points necessary for completing the task, based on the given instruction and task description. (2) Key Screenshot Identification: Important screenshots are selected from the agent’s trajectory to retain relevant visual evidence while discarding uninformative frames. (3) Outcome Judgment: Output the judgment result based on the task description, key points, key screenshots, and the action history. Our method preserves critical intermediate screenshots while mitigating the token overload issue.
+
+<p align="center">
+  <img src="./images/WebJudge.jpg" width="100%" alt="pipeline">
+</p>
+
+# Results
+
+## Comparison against Existing Evaluation Methods on Online-Mind2Web
+<table>
+<tr>
+  <th>Model</th>
+  <th>Auto-Eval</th>
+  <td>SeeAct</td>
+  <td>Agent-E</td>
+  <td>Browser Use</td>
+  <td>Claude 3.5 </td>
+  <td>Claude 3.7</td>
+  <td>Operator</td>
+  <th>Avg AR</th>
+</tr>
+<tr>
+  <th rowspan="4">GPT-4o</th>
+  <td>Autonomous Eval</td>
+  <td>84.7</td>
+  <td>85.0</td>
+  <td>76.0</td>
+  <td>83.7</td>
+  <td>75.5</td>
+  <td>71.7</td>
+  <td>79.4</td>
+</tr>
+<tr>
+  <td>AgentTrek Eval</td>
+  <td>73.0</td>
+  <td>64.3</td>
+  <td>63.3</td>
+  <td>--</td>
+  <td>--</td>
+  <td>--</td>
+  <td>66.9</td>
+</tr>
+<tr>
+  <td>WebVoyager</td>
+  <td>--</td>
+  <td>75.3</td>
+  <td>71.3</td>
+  <td>74.0</td>
+  <td>72.0</td>
+  <td>76.7</td>
+  <td>73.9</td>
+</tr>
+<tr>
+  <td>WebJudge</td>
+  <td>86.7</td>
+  <td>86.0</td>
+  <td>81.4</td>
+  <td>86.3</td>
+  <td>79.1</td>
+  <td>81.8</td>
+  <td><b>83.6</b></td>
+</tr>
+
+<tr>
+  <th rowspan="3">o4-mini</th>
+  <td>Autonomous Eval</td>
+  <td>79.7</td>
+  <td>85.7</td>
+  <td>86.0</td>
+  <td>84.3</td>
+  <td>68.0</td>
+  <td>73.3</td>
+  <td>79.5</td>
+</tr>
+<tr>
+  <td>WebVoyager</td>
+  <td>--</td>
+  <td>80.3</td>
+  <td>79.0</td>
+  <td>81.7</td>
+  <td>74.3</td>
+  <td>78.3</td>
+  <td>78.7</td>
+</tr>
+<tr>
+  <td>WebJudge</td>
+  <td>85.3</td>
+  <td>86.3</td>
+  <td>89.3</td>
+  <td>87.0</td>
+  <td>82.3</td>
+  <td>83.7</td>
+  <td><b>85.7</b></td>
+</tr>
+
+<tr>
+  <th></th>
+  <td>WebJudge-7B</td>
+  <td>86.0</td>
+  <td>87.3</td>
+  <td>88.3</td>
+  <td>89.7</td>
+  <td>84.3</td>
+  <td>86.3</td>
+  <td><b>87.0</b></td>
+</tr>
+</table>
+WebJudge powered by GPT-4o and o4-mini consistently achieves the highest agreement, with averages of 83.6% and 85.7%, respectively. Meanwhile, WebJudge-7B even outperforms o4-mini, reaching a high agreement with human judgment of 87%.
+
+
+## Excellent generalization capabilities on [AgentRewardBench](https://agent-reward-bench.github.io/) (5 OOD benchmarks)
+| **Methods** | **AB** | **VWA** | **WA** | **Work** | **Wk++** | **Overall** |
+|--------------|--------|--------|--------|----------|----------|--------------|
+| *Rule-based** | 25.0 | **85.2** | 79.0 | 100.0 | 83.3 | 83.8 |
+| Autonomous Eval* | 83.3 | 61.2 | 67.6 | 96.4 | 59.3 | 67.6 |
+| GPT-4o (A11y Tree)* | 77.8 | 63.0 | 70.2 | 94.6 | 63.0 | 69.8 |
+| WebJudge (GPT-4o) | 66.7 | 69.8 | 72.6 | 92.3 | 75.0 | 73.7 |
+| WebJudge-7B | 80.0 | 66.7 | 77.5 | 100.0 | 70.0 | 75.7 |
+| WebJudge (o4-mini) | **100.0** | 74.5 | **81.2** | **100.0** | **90.0** | **82.0** |
+
+WebJudge significantly outperforms existing methods, achieving impressive overall precision of 73.7%, 75.7%, and 82.0% (with GPT-4o, WebJudge-7B, and o4-mini, respectively) on WebArena (WA), VisualWebArena (VWA), AssistantBench (AB), WorkArena (Work) and WorkArena++ (Wk++) across 1302 trajectories.
+
+The high precision suggests that WebJudge holds potential as a robust and scalable reward model for downstream applications such as Rejection Sampling Fine-Tuning, Reflection, and Reinforcement Learning.
+
+# Model Release
+We have released the fine-tuned [WebJudge-7B](https://huggingface.co/osunlp/WebJudge-7B) weights, which are now available on Hugging Face.
+
+# Setup Environment
+
+Create a conda environment and install dependencies:
+```
+conda create -n Online_Mind2Web python=3.11
+conda activate Online_Mind2Web
+pip install -r requirements.txt
+```
+
+# Evaluation
+You can run the provided example evaluation script directly to perform the evaluation. Adjust the "mode" parameter to choose among various auto-eval methods.
+```bash
+bash ./script/eval.sh
+```
+
+## Important Notes for Reliable Evaluation on Online-Mind2Web:
+> [!IMPORTANT]
+> - **Start from the specified websites, not Google Search**: To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
+> - **Include only factual actions, not agent outputs**: The action history should contain only the factual actions taken by the agent to complete the task (e.g., clicking elements and typing text). Do not include the final response or any other agent's outputs, as they may contain hallucinated content and result in a high rate of false positives.
+> - **Use o4-mini for WebJudge**: WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+
+# Evaluation Results
+
+In certain scenarios, testing on the full Online-Mind2Web dataset may not be feasible due to cost, privacy, or legal constraints. To facilitate fair, apples-to-apples comparisons, we release both our human evaluation labels and auto-eval details.
+
+- **Human Evaluation**: Task-level human evaluation labels are provided in the [file](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/online_mind2web_evaluation_results/human_label.json).
+- **Auto-Evaluation**: The results of WebJudge are available in the [folder](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/online_mind2web_evaluation_results).
+
+## 📚 Citation
+
+Note: Online-Mind2Web is derived from the original Mind2Web dataset. We kindly ask that you cite both the original and this work when using or referencing the data.
+```
+@article{xue2025illusionprogressassessingcurrent,
+      title={An Illusion of Progress? Assessing the Current State of Web Agents}, 
+      author={Tianci Xue and Weijian Qi and Tianneng Shi and Chan Hee Song and Boyu Gou and Dawn Song and Huan Sun and Yu Su},
+      year={2025},
+      eprint={2504.01382},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2504.01382}, 
+}
+
+@inproceedings{deng2023mind2web,
+ author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
+ pages = {28091--28114},
+ publisher = {Curran Associates, Inc.},
+ title = {Mind2Web: Towards a Generalist Agent for the Web},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf},
+ volume = {36},
+ year = {2023}
+}
+```
--- a/webeval/src/webeval/benchmarks/om2w/impl/data/README.md
+++ b/webeval/src/webeval/benchmarks/om2w/impl/data/README.md
@@ -0,0 +1,27 @@
+## Example
+
+You can refer to the [example](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f) format when conducting auto-eval. For each task, the result should be stored in a folder named after its `task_id`, containing:
+
+- `trajectory/`: Stores screenshots of each step.
+- `result.json`: Task metadata and action history.
+
+
+## 🧠 Human Labels
+
+The file `human_label.json` provides human evaluation labels for each task across different agents. The labels have the following meanings:
+
+- `0`: ❌ **Failure** – The agent did **not** successfully complete the task.
+- `1`: ✅ **Success** – The agent **successfully** completed the task.
+- `2`: 🚫 **Not Executable** – The agent was **unable to execute** the task due to external or system-related limitations.
+
+## ℹ️ Details on Label 2
+
+The label 2 indicates that the agent could not execute the task at all. The specific reasons vary by agent:
+
+- **SeeAct**: Some tasks raise internal bugs when the agent extracts elements from the web page.
+- **Operator**: Certain websites (e.g., Reddit) are inaccessible, preventing task execution.
+- **Claude Computer Use 3.7**: Due to updates on the websites, a few tasks became outdated during the testing phase.
+- **Agent-E** and **Browser Use**: Label 2 typically corresponds to tasks blocked by **CAPTCHAs**. There are only **two** tasks for Agent-E and **one** task for Browser Use.
+
+## Note
+The results on the leaderboard are averaged over three runs, so there may be slight differences compared to the auto-eval results shown here.
--- a/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/result.json
+++ b/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/result.json
@@ -0,0 +1,17 @@
+{
+    "task_id": "fb7b4f784cfde003e2548fdf4e8d6b4f",
+    "task": "Open the page with an overview of the submission of releases on Discogs.",
+    "final_result_response": "The page with an overview of submission guidelines for releases on Discogs is open: https://support.discogs.com/hc/en-us/articles/360004016474-Overview-Of-Submission-Guidelines-For-Releases",
+    "action_history": [
+        "<div role=\"button\"> -> CLICK",
+        "<button> -> CLICK",
+        "<button> -> CLICK",
+        "<a href=\"https://support.discogs.com/hc/articles/360004016474-Overview-of-Submission-Guidelines-for-Releases\" role=\"menuitem\"> -> CLICK"
+    ],
+    "thoughts": [
+        "Navigate to the section about submission of releases on Discogs.",
+        "Accept cookies to ensure smooth navigation, then look for a link related to submissions or guidelines.",
+        "Explore the 'Community' section to find information about submission of releases.",
+        "Click on 'Database Guidelines' to find information about submission of releases."
+    ]
+}
--- a/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/0_full_screenshot.png
+++ b/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/0_full_screenshot.png
--- a/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/1_full_screenshot.png
+++ b/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/1_full_screenshot.png
--- a/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/2_full_screenshot.png
+++ b/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/2_full_screenshot.png
--- a/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/3_full_screenshot.png
+++ b/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/3_full_screenshot.png
--- a/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/4_full_screenshot.png
+++ b/webeval/src/webeval/benchmarks/om2w/impl/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f/trajectory/4_full_screenshot.png
--- a/webeval/src/webeval/benchmarks/om2w/impl/images/WebJudge.jpg
+++ b/webeval/src/webeval/benchmarks/om2w/impl/images/WebJudge.jpg
--- a/webeval/src/webeval/benchmarks/om2w/impl/requirements.txt
+++ b/webeval/src/webeval/benchmarks/om2w/impl/requirements.txt
@@ -0,0 +1,3 @@
+backoff==2.2.1
+openai==1.68.2
+Pillow==11.1.0
--- a/webeval/src/webeval/benchmarks/om2w/impl/script/eval.sh
+++ b/webeval/src/webeval/benchmarks/om2w/impl/script/eval.sh
@@ -0,0 +1,24 @@
+
+# Placeholder values: replace API_KEY and MODEL_NAME before running.
+api_key=API_KEY
+model_name=MODEL_NAME
+
+# Automatic evaluation methods; run.py is invoked once for each entry.
+modes=(
+    "WebJudge_Online_Mind2Web_eval"
+    "WebJudge_general_eval"
+    "Autonomous_eval"
+    "WebVoyager_eval"
+    "AgentTrek_eval"
+)
+
+# Trajectories are read from $base_dir; results go to "${base_dir}_result"
+# (i.e. ./data/example_result). Paths are relative to the current directory.
+base_dir="./data/example"
+for mode in "${modes[@]}"; do
+    python ./src/run.py \
+        --mode "$mode" \
+        --model "${model_name}" \
+        --trajectories_dir "$base_dir" \
+        --api_key "${api_key}" \
+        --output_path ${base_dir}_result \
+        --num_worker 1 \
+        --score_threshold 3
+done
--- a/webeval/src/webeval/benchmarks/om2w/impl/src/clean_html.py
+++ b/webeval/src/webeval/benchmarks/om2w/impl/src/clean_html.py
@@ -0,0 +1,42 @@
+from typing import Any, Iterable, List
+from bs4 import BeautifulSoup
+
+# Attribute names kept when an element tag is cleaned; intended to be passed
+# as `salient_attributes` to process_element_tag (as the __main__ example does).
+SALIENT_ATTRIBUTES = (
+    "alt",
+    "aria-describedby",
+    "aria-label",
+    "aria-role",
+    "aria-controls",
+    "input-checked",
+    "label",
+    "name",
+    "option_selected",
+    "placeholder",
+    "readonly",
+    "text-value",
+    "title",
+    "value",
+    "data-gtm-label",
+    "href",
+    "role",
+)
+
+# Returns the opening tag of the first element parsed from `element`, rebuilt
+# with only the attributes listed in `salient_attributes`. If no tag is found,
+# the input string is returned (including any "'>" suffix appended below).
+def process_element_tag(element: str, salient_attributes: Iterable[str]) -> str:
+    """Clean an HTML element string, keeping only salient_attributes."""
+    # Presumably closes a truncated attribute value and tag so the parser can
+    # still recognise the element — TODO confirm the expected input shape.
+    if not element.endswith(">"):
+        element += "'>"
+
+    soup = BeautifulSoup(element, "html.parser")
+    # The loop body returns on its first iteration, so only the first tag found
+    # in the string is processed.
+    for tag in soup.find_all(True):
+        # Keep only salient attributes
+        filtered_attrs = {k: tag.attrs[k] for k in tag.attrs if k in salient_attributes}
+        # "name" is removed before the ** expansion because it would collide
+        # with the tag-name argument of new_tag; it is set on the tag afterwards.
+        name_val = filtered_attrs.pop("name", None)
+        new_tag = soup.new_tag(tag.name, **filtered_attrs)
+        # An empty or missing name attribute is dropped.
+        if name_val:
+            new_tag["name"] = name_val
+        # Keep only the text before the closing tag, i.e. the opening tag.
+        return str(new_tag).split(f"</{tag.name}>")[0]
+    return element
+
+# Manual example: clean one action-history entry (an <input> tag followed by
+# " -> TYPE ...") and print the result.
+if __name__ == "__main__":
+    text = '<input type=\"text\" name=\"q\" id=\"mntl-search-form--open__search-input\" class=\"mntl-search-form__input\" placeholder=\"Find a recipe or ingredient\" required=\"required\" value=\"\" autocomplete=\"off\" style=\"\"> -> TYPE beef sirloin'
+    print(process_element_tag(text, SALIENT_ATTRIBUTES))
--- a/webeval/src/webeval/benchmarks/om2w/impl/src/methods/agenttrek_eval.py
+++ b/webeval/src/webeval/benchmarks/om2w/impl/src/methods/agenttrek_eval.py
@@ -0,0 +1,56 @@
+from webeval.benchmarks.om2w.impl.src.utils import encode_image
+from PIL import Image
+
+def AgentTrek_eval(task, last_actions, thoughts, images_path):
+    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task goal, the agent's trajectory, your goal is to decide whether the agent's execution is successful or not.
+
+*Evaluation Criteria* 
+Whether the agent's trajectory is effective and corresponding to the goal 
+
+*Instructions* 1. Review the agent's actions and reasoning processes step by step. 
+2. if the agent is stuck in the very first login stage, which means it fails to log into target website at the beginning, that's a failure. 
+3. Determine if the agent has achieved the task goal based on the trajectory. A task can be considered successful if most trajectory is effective. 
+4. the agent sometimes can't stop after finishing a task and continue doing repeated actions. these actions may be some failed attempt after a series of correct actions. the task should be regarded as successful if the correct actions are effective and almost reach the goal. 
+5. if the agent is stuck in the loop at the early stage of the task, which means they don't even get close to the goal before they get stuck in the loop, that's a failure. for example, the agent begin to get stuck before third step. 
+6. when the task is to change the google account password, it can't be regarded as successful when agent finish at trying to click "manage your account". 
+7. if there are over 8 correct action in the trajectory, it can be regard as a successful agent. 
+8. final saving action is not a must. the task is successful if the agent does most things right and just forget to save the change at last. 
+9. if the original task has 2 subtasks, the agent only complete one of them, that's still a success. e.g. the task is to update name and birthday, but agent only update name, that's fine. 
+10. if the task is to post a review, the agent can be considered successful when it finish writing the review and reach the step to post it, don't have to click the post button. 
+11. Since we don't have a printer, some printing related task can be considered successful if the agent reach the step to click print button. 
+12. if the task is finished at the initial state and the agent do nothing because of it, it should also be regarded as successful.
+
+*IMPORTANT* 
+1. in the trajectory, an action always follows a corresponding reasoning, which shows the observation and thought of the agent. 
+2. your response should be contain: 
+Thoughts: <your thoughts and reasoning process> 
+Status: "success" or "failure"
+"""
+    prompt = """The goal of the task: {task}
+
+Trajectory:
+{thoughts_and_actions}
+
+The last snapshot of the web page is shown in the image."""
+    thoughts_and_actions = ""
+    for idx, (thought, action) in enumerate(zip(thoughts, last_actions)):
+        thought = thought.replace("\n\n", " ")
+        action = action.replace("\n\n", " ")
+        thoughts_and_actions += f"Thought {idx+1}: {thought}\nAction {idx+1}: {action}\n\n"
+    text = prompt.format(task=task, thoughts_and_actions=thoughts_and_actions.strip("\n\n"))
+
+    jpg_base64_str = encode_image(Image.open(images_path))
+    messages = [
+        {"role": "system", "content": system_msg},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": text},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"},
+                },
+            ],
+        }
+    ]
+    return messages, text, system_msg
--- a/webeval/src/webeval/benchmarks/om2w/impl/src/methods/automomous_eval.py
+++ b/webeval/src/webeval/benchmarks/om2w/impl/src/methods/automomous_eval.py
@@ -0,0 +1,41 @@
+from webeval.benchmarks.om2w.impl.src.utils import encode_image
+from PIL import Image
+
+def Autonomous_eval(task, last_actions, images_path):
+    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's intent, the agent's action history, the final state of the webpage, and the agent's response to the user, your goal is to decide whether the agent's execution is successful or not.
+
+There are three types of tasks:
+1. Information seeking: The user wants to obtain certain information from the webpage, such as the information of a product, reviews, map info, comparison of map routes, etc. The bot's response must contain the information the user wants, or explicitly state that the information is not available. Otherwise, e.g. the bot encounters an exception and respond with the error content, the task is considered a failure. Besides, be careful about the sufficiency of the agent's actions. For example, when asked to list the top-searched items in a shop, the agent should order the items by the number of searches, and then return the top items. If the ordering action is missing, the task is likely to fail.
+2. Site navigation: The user wants to navigate to a specific page. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response.
+3. Content modification: The user wants to modify the content of a webpage or configuration. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response.
+
+*IMPORTANT*
+Format your response into two lines as shown below:
+
+Thoughts: <your thoughts and reasoning process>
+Status: "success" or "failure"
+"""
+    prompt = """User Intent: {task}
+
+Action History:
+{last_actions}
+
+The last snapshot of the web page is shown in the image."""
+
+    text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. {action}" for i, action in enumerate(last_actions)))
+
+    jpg_base64_str = encode_image(Image.open(images_path))
+    messages = [
+        {"role": "system", "content": system_msg},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": text},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"},
+                },
+            ],
+        }
+    ]
+    return messages, text, system_msg
--- a/webeval/src/webeval/benchmarks/om2w/impl/src/methods/webjudge_general_eval.py
+++ b/webeval/src/webeval/benchmarks/om2w/impl/src/methods/webjudge_general_eval.py
@@ -0,0 +1,225 @@
+from webeval.benchmarks.om2w.impl.src.utils import encode_image
+from PIL import Image
+import re
+import asyncio
+# Maximum number of screenshots (and accompanying reasons) kept for the final judge prompt.
+MAX_IMAGE =50
+
+async def identify_key_points(task, input_image_paths, model):
+    system_msg = """You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description.
+
+**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal.
+
+**Instructions**:
+1. Read the task description carefully.
+2. Identify and extract **key points** directly stated in the task description.
+   - A **key point** is a critical element, condition, or step explicitly mentioned in the task description.
+   - Do not infer or add any unstated elements.
+   - Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest").
+
+**Respond with**:
+- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details."""
+    
+    prompt = """Task: {task}"""
+    text = prompt.format(task=task)
+
+    input_images_msg = []
+
+    if input_image_paths != None:
+        for input_image_path in input_image_paths:
+            input_images_jpg_base64_str = encode_image(Image.open(input_image_path))
+            input_images_msg.append(
+                                        {
+                                            'type': 'image_url',
+                                            'image_url': {"url": f"data:image/png;base64,{input_images_jpg_base64_str}", "detail": "high"}
+                                        }
+                                    )
+
+    messages = [
+            {"role": "system", "content": system_msg},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": text}
+                ]+ input_images_msg,
+            }
+        ]
+    responses = await model.agenerate(messages)
+    return responses[0]
+
+async def judge_image(task, input_image_paths, image_path, key_points, model):
+    system_msg = """You are an expert evaluator tasked with determining whether an image contains information about the necessary steps to complete a task.
+
+**Objective**: Analyze the provided image and decide if it shows essential steps or evidence required for completing the task. Use your reasoning to explain your decision before assigning a score.
+
+**Instructions**:
+1. Provide a detailed description of the image, including its contents, visible elements, text (if any), and any notable features.
+
+2. Carefully examine the image and evaluate whether it contains necessary steps or evidence crucial to task completion:  
+- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions.  
+- Does the image show actions, progress indicators, or critical information directly related to completing the task?  
+- Is this information indispensable for understanding or ensuring task success?
+- If the image contains partial but relevant information, consider its usefulness rather than dismissing it outright.
+
+3. Provide your response in the following format:  
+- **Reasoning**: Explain your thought process and observations. Mention specific elements in the image that indicate necessary steps, evidence, or lack thereof.  
+- **Score**: Assign a score based on the reasoning, using the following scale:  
+    - **1**: The image does not contain any necessary steps or relevant information.  
+    - **2**: The image contains minimal or ambiguous information, unlikely to be essential.  
+    - **3**: The image includes some relevant steps or hints but lacks clarity or completeness.  
+    - **4**: The image contains important steps or evidence that are highly relevant but not fully comprehensive.  
+    - **5**: The image clearly displays necessary steps or evidence crucial for completing the task.
+
+Respond with:  
+### Reasoning**: [Your explanation]  
+### Score**: [1-5]"""
+
+
+    prompt = """**Task**: {task}
+
+**Key Points for Task Completion**: {key_points}
+
+The snapshot of the web page is shown in the image."""
+    text = prompt.format(task=task,key_points=key_points)
+
+    input_images_msg = []
+    if input_image_paths != None:
+        for input_image_path in input_image_paths:
+            input_images_jpg_base64_str = encode_image(Image.open(input_image_path))
+            input_images_msg.append(
+                                        {
+                                            'type': 'image_url',
+                                            'image_url': {"url": f"data:image/png;base64,{input_images_jpg_base64_str}", "detail": "high"}
+                                        }
+                                    )
+    messages = [{"role": "system", "content": system_msg}]
+
+    if input_images_msg:
+        messages.append({
+            "role": "user",
+            "content": [{"type": "text", "text": "The input images are:"}] + input_images_msg
+        })
+    
+    jpg_base64_str = encode_image(Image.open(image_path))
+    messages.append(
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": text},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"},
+                    },
+                ]
+            }
+        )
+
+    responses = await model.agenerate(messages)
+    return responses[0]
+
+
+async def WebJudge_general_eval(task, input_image_paths, action_thoughts, last_actions, images_path, model, score_threshold):
+    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, some potentially important web pages in the agent's trajectory and their reasons, your goal is to determine whether the agent has completed the task and achieved all requirements.
+
+Your response must strictly follow the following evaluation criteria!
+*Important Evaluation Criteria*:
+1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), it should be considered a failure.
+2: You must carefully check whether these snapshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function).
+3: Certain key points or requirements should be applied by the filter. Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements!
+4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. To ensure the task is successful, the applied filter must precisely match the specified range without being too broad or too narrow.
+5: Some tasks require a submission action or a display of results to be considered successful. Repeat actions or actions that do not lead to a visible result should be considered a failure.
+6: If the agent loops through a sequence of actions that do not make progress toward the goal (including failing to click "Save" or "Submit," etc.), it should be considered a failure.
+
+Format your response into two lines as shown below:
+Thoughts: <your thoughts and reasoning process should base on double-checking each key points and the evaluation criteria>
+Status: "success" or "failure"
+"""
+    prompt = """User Task: {task}
+
+Key Points: {key_points}
+
+Action History:
+{last_actions}
+
+The potentially important snapshots of the webpage in the agent's trajectory and their reasons:
+{thoughts}"""
+
+
+    key_points = await identify_key_points(task, input_image_paths, model)
+    key_points = key_points.replace("\n\n", "\n")
+
+    try:
+        key_points = key_points.split("**Key Points**:")[1]
+        key_points = "\n".join(line.lstrip() for line in key_points.splitlines())
+    except:
+        key_points = key_points.split("Key Points:")[-1]
+        key_points = "\n".join(line.lstrip() for line in key_points.splitlines())
+    
+    tasks = [judge_image(task, input_image_paths, image_path, key_points, model) for image_path in images_path]
+    image_responses = await asyncio.gather(*tasks)
+
+    input_images_msg = []
+    whole_content_img = []
+    whole_thoughts = []
+    record = []
+    pattern = r"[1-5]"
+    for response, image_path in zip(image_responses, images_path):
+        try:
+            score_text = response.split("### Score")[1]
+            thought = response.split("### Reasoning:")[-1].strip().lstrip("\n").split("### Score")[0].replace('\n',' ')
+            score = re.findall(pattern, score_text)[0]
+            record.append({"Response": response, "Score": int(score)})
+        except Exception as e:
+            print(f"Error processing response: {e}")
+            score = 0
+            record.append({"Response": response, "Score": 0})
+
+        if int(score) >= score_threshold:
+            jpg_base64_str = encode_image(Image.open(image_path))
+            whole_content_img.append(
+                {
+                    'type': 'image_url',
+                    'image_url': {"url": f"data:image/png;base64,{jpg_base64_str}", "detail": "high"}
+                }
+            )
+            if thought != "":
+                whole_thoughts.append(thought)
+
+    whole_content_img = whole_content_img[:MAX_IMAGE]
+    whole_thoughts = whole_thoughts[:MAX_IMAGE]
+    if len(whole_content_img) == 0:
+        prompt = """User Task: {task}
+
+Key Points: {key_points}
+
+Action History:
+{last_actions}"""
+
+    if action_thoughts != None:
+        text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. {action}. Reasoning: {action_thought}" for i, (action, action_thought) in enumerate(zip(last_actions,action_thoughts))), key_points=key_points, thoughts = "\n".join(f"{i+1}. {thought}" for i, thought in enumerate(whole_thoughts)))
+
+    else:
+        text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. {action}" for i, action in enumerate(last_actions)), key_points=key_points, thoughts = "\n".join(f"{i+1}. {thought}" for i, thought in enumerate(whole_thoughts)))
+
+    input_images_msg = []
+    if input_image_paths is not None:
+        for path in input_image_paths:
+            input_images_jpg_base64_str = encode_image(Image.open(path))
+            input_images_msg.append({
+                "type": "image_url",
+                "image_url": {"url": f"data:image/png;base64,{input_images_jpg_base64_str}", "detail": "high"}
+            })
+
+    messages = [{"role": "system", "content": system_msg}]
+
+    if input_images_msg:
+        messages.append({
+            "role": "user",
+            "content": [{"type": "text", "text": "The input images are:"}] + input_images_msg
+        })
+
+    messages.append({
+        "role": "user",
+        "content": [{"type": "text", "text": text}] + whole_content_img
+    })
+    
+    return messages, text, system_msg, record, key_points
--- a/webeval/src/webeval/benchmarks/om2w/impl/src/methods/webjudge_online_mind2web.py
+++ b/webeval/src/webeval/benchmarks/om2w/impl/src/methods/webjudge_online_mind2web.py
@@ -0,0 +1,184 @@
+from webeval.benchmarks.om2w.impl.src.utils import encode_image
+from PIL import Image
+import re
+import asyncio
+# Maximum number of screenshots (and accompanying reasons) kept for the final judge prompt.
+MAX_IMAGE =50
+
+async def identify_key_points(task, model):
+    system_msg = """You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description.
+
+**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal.
+
+**Instructions**:
+1. Read the task description carefully.
+2. Identify and extract **key points** directly stated in the task description.
+   - A **key point** is a critical element, condition, or step explicitly mentioned in the task description.
+   - Do not infer or add any unstated elements.
+   - Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest").
+
+**Respond with**:
+- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details."""
+    prompt = """Task: {task}"""
+    text = prompt.format(task=task)
+    messages = [
+            {"role": "system", "content": system_msg},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": text}
+                ],
+            }
+        ]
+    responses = await model.agenerate(messages)
+    return responses[0]
+
+async def judge_image(task, image_path, key_points, model):
+    system_msg = """You are an expert evaluator tasked with determining whether an image contains information about the necessary steps to complete a task.
+
+**Objective**: Analyze the provided image and decide if it shows essential steps or evidence required for completing the task. Use your reasoning to explain your decision before assigning a score.
+
+**Instructions**:
+1. Provide a detailed description of the image, including its contents, visible elements, text (if any), and any notable features.
+
+2. Carefully examine the image and evaluate whether it contains necessary steps or evidence crucial to task completion:  
+- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions.  
+- Does the image show actions, progress indicators, or critical information directly related to completing the task?  
+- Is this information indispensable for understanding or ensuring task success?
+- If the image contains partial but relevant information, consider its usefulness rather than dismissing it outright.
+
+3. Provide your response in the following format:  
+- **Reasoning**: Explain your thought process and observations. Mention specific elements in the image that indicate necessary steps, evidence, or lack thereof.  
+- **Score**: Assign a score based on the reasoning, using the following scale:  
+    - **1**: The image does not contain any necessary steps or relevant information.  
+    - **2**: The image contains minimal or ambiguous information, unlikely to be essential.  
+    - **3**: The image includes some relevant steps or hints but lacks clarity or completeness.  
+    - **4**: The image contains important steps or evidence that are highly relevant but not fully comprehensive.  
+    - **5**: The image clearly displays necessary steps or evidence crucial for completing the task.
+
+Respond with:  
+1. **Reasoning**: [Your explanation]  
+2. **Score**: [1-5]"""
+
+    jpg_base64_str = encode_image(Image.open(image_path))
+
+    prompt = """**Task**: {task}
+
+**Key Points for Task Completion**: {key_points}
+
+The snapshot of the web page is shown in the image."""
+    text = prompt.format(task=task,key_points=key_points)
+
+    messages = [
+            {"role": "system", "content": system_msg},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": text},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"},
+                    },
+                ],
+            }
+        ]
+
+    responses = await model.agenerate(messages)
+    return responses[0]
+
+async def WebJudge_Online_Mind2Web_eval(task, last_actions, images_path, model, score_threshold):
+    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, some potentially important web pages in the agent's trajectory and their reasons, your goal is to determine whether the agent has completed the task and achieved all requirements.
+
+Your response must strictly follow the following evaluation criteria!
+*Important Evaluation Criteria*:
+1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), the task is not considered successful.
+2: You must carefully check whether these snapshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function).
+3: Certain key points or requirements should be applied by the filter. Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements!
+4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. To ensure the task is successful, the applied filter must precisely match the specified range without being too broad or too narrow.
+Examples of Failure Cases:
+- If the requirement is less than $50, but the applied filter is less than $25, it is a failure.
+- If the requirement is $1500-$2500, but the applied filter is $2000-$2500, it is a failure.
+- If the requirement is $25-$200, but the applied filter is $0-$200, it is a failure.
+- If the required years are 2004-2012, but the filter applied is 2001-2012, it is a failure.
+- If the required years are before 2015, but the applied filter is 2000-2014, it is a failure.
+- If the task requires exactly 2 beds, but the filter applied is 2+ beds, it is a failure.
+5: Some tasks require a submission action or a display of results to be considered successful.
+6: If the retrieved information is invalid or empty(e.g., No match was found), but the agent has correctly performed the required action, it should still be considered successful.
+7: If the current page already displays all available items, then applying a filter is not necessary. As long as the agent selects items that meet the requirements (e.g., the cheapest or lowest price), the task is still considered successful.
+
+*IMPORTANT*
+Format your response into two lines as shown below:
+
+Thoughts: <your thoughts and reasoning process based on double-checking each key points and the evaluation criteria>
+Status: "success" or "failure"
+"""
+    prompt = """User Task: {task}
+
+Key Points: {key_points}
+
+Action History:
+{last_actions}
+
+The potentially important snapshots of the webpage in the agent's trajectory and their reasons:
+{thoughts}"""
+
+
+    key_points = await identify_key_points(task, model)
+    key_points = key_points.replace("\n\n", "\n")
+
+    try:
+        key_points = key_points.split("**Key Points**:")[1]
+        key_points = "\n".join(line.lstrip() for line in key_points.splitlines())
+    except:
+        key_points = key_points.split("Key Points:")[-1]
+        key_points = "\n".join(line.lstrip() for line in key_points.splitlines())
+    
+    tasks = [judge_image(task, image_path, key_points, model) for image_path in images_path]
+    image_responses = await asyncio.gather(*tasks)
+
+    whole_content_img = []
+    whole_thoughts = []
+    record = []
+    pattern = r"[1-5]"
+    for response, image_path in zip(image_responses, images_path):
+        try:
+            score_text = response.split("Score")[1]
+            thought = response.split("**Reasoning**:")[-1].strip().lstrip("\n").split("\n\n")[0].replace('\n',' ')
+            score = re.findall(pattern, score_text)[0]
+            record.append({"Response": response, "Score": int(score)})
+        except Exception as e:
+            print(f"Error processing response: {e}")
+            score = 0
+            record.append({"Response": response, "Score": 0})
+
+        if int(score) >= score_threshold:
+            jpg_base64_str = encode_image(Image.open(image_path))
+            whole_content_img.append(
+                {
+                    'type': 'image_url',
+                    'image_url': {"url": f"data:image/png;base64,{jpg_base64_str}", "detail": "high"}
+                }
+            )
+            if thought != "":
+                whole_thoughts.append(thought)
+
+    whole_content_img = whole_content_img[:MAX_IMAGE]
+    whole_thoughts = whole_thoughts[:MAX_IMAGE]
+    if len(whole_content_img) == 0:
+        prompt = """User Task: {task}
+
+Key Points: {key_points}
+
+Action History:
+{last_actions}"""
+    text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. {action}" for i, action in enumerate(last_actions)), key_points=key_points, thoughts = "\n".join(f"{i+1}. {thought}" for i, thought in enumerate(whole_thoughts)))
+
+    messages = [
+        {"role": "system", "content": system_msg},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": text}]
+                + whole_content_img
+        }
+    ]
+    return messages, text, system_msg, record, key_points
--- a/webeval/src/webeval/benchmarks/om2w/impl/src/methods/webvoyager_eval.py
+++ b/webeval/src/webeval/benchmarks/om2w/impl/src/methods/webvoyager_eval.py
@@ -0,0 +1,49 @@
+from webeval.benchmarks.om2w.impl.src.utils import encode_image
+from PIL import Image
+# Maximum number of screenshot paths considered; the list is cut to its first MAX_IMAGE entries.
+MAX_IMAGE =50
+
+def WebVoyager_eval(task, images_path, response, k=0):
+    system_msg = """As an evaluator, you will be presented with three primary components to assist you in your role:
+
+1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc).
+
+2. Result Screenshots: This is a visual representation of the screen showing the result or intermediate state of performing a web task. It serves as visual proof of the actions taken in response to the instruction.
+
+3. Result Response: This is a textual response obtained after the execution of the web task. It serves as textual result in response to the instruction.
+
+-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites.
+-- You SHOULD NOT make assumptions based on information not presented in the screenshot when comparing it to the instructions.
+-- Your primary responsibility is to conduct a thorough assessment of the web task instruction against the outcome depicted in the screenshot and in the response, evaluating whether the actions taken align with the given instructions.
+-- NOTE that the instruction may involve more than one task, for example, locating the garage and summarizing the review. Failing to complete either task, such as not providing a summary, should be considered unsuccessful.
+-- NOTE that the screenshot is authentic, but the response provided by LLM is generated at the end of web browsing, and there may be discrepancies between the text and the screenshots.
+-- Note the difference: 1) Result response may contradict the screenshot, then the content of the screenshot prevails, 2) The content in the Result response is not mentioned on the screenshot, choose to believe the content.
+
+You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the task has been successfully accomplished, either as 'SUCCESS' or 'FAILURE'."""
+    prompt = """TASK: {task}
+
+Result Response: {response}
+
+{num} screenshots at the end: """
+
+    whole_content_img = []
+    images_path = images_path[:MAX_IMAGE]
+    text = prompt.format(task=task, response=response, num = len(images_path) if k == 0 else k)
+
+    for image in images_path[-k:]:
+        jpg_base64_str = encode_image(Image.open(image))
+        whole_content_img.append(
+            {
+                'type': 'image_url',
+                'image_url': {"url": f"data:image/png;base64,{jpg_base64_str}", "detail": "high"}
+            }
+        )
+    messages = [
+        {"role": "system", "content": system_msg},
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": text}] 
+            + whole_content_img
+            + [{'type': 'text', 'text': "Your verdict:\n"}]
+        }
+    ]
+    return messages, text, system_msg
--- a/webeval/src/webeval/benchmarks/om2w/impl/src/run.py
+++ b/webeval/src/webeval/benchmarks/om2w/impl/src/run.py
@@ -0,0 +1,162 @@
+import argparse
+import os
+from methods.agenttrek_eval import *
+from methods.automomous_eval import *
+from methods.webjudge_general_eval import *
+from methods.webjudge_online_mind2web import *
+from methods.webvoyager_eval import *
+from utils import OpenaiEngine, extract_predication
+import json
+import copy
+import asyncio
+import multiprocessing
+
+def _sorted_screenshot_paths(trajectory_images_path):
+    """Return the screenshot paths of one trajectory, ordered by step number.
+
+    The step number is the first run of digits in each file name. A file
+    name without any digit raises ``IndexError``.
+    """
+    # Imported locally: this module does not import `re` itself and would
+    # otherwise depend on the name leaking in through the star imports.
+    import re
+
+    ordered_names = sorted(
+        os.listdir(trajectory_images_path),
+        key=lambda name: int(re.findall(r'\d+', name)[0]),
+    )
+    return [os.path.join(trajectory_images_path, name) for name in ordered_names]
+
+
+def auto_eval(args, task_subset, final_predicted_labels, lock, model):
+    """Judge every task in ``task_subset`` and append one JSON line per task.
+
+    Tasks that already appear in the results file are not judged again;
+    their stored label is reported instead, so a caller that sums
+    ``final_predicted_labels`` still covers every task after a resumed run.
+
+    Args:
+        args: Parsed CLI namespace; reads ``output_path``, ``mode``,
+            ``model``, ``score_threshold`` and ``trajectories_dir``.
+        task_subset: Task ids, i.e. sub-directory names of
+            ``args.trajectories_dir``.
+        final_predicted_labels: Shared list receiving one label per task.
+        lock: Lock guarding ``final_predicted_labels`` and the results file.
+        model: Judge engine exposing ``generate(messages)``.
+
+    Raises:
+        ValueError: If ``args.mode`` is not a known evaluation mode.
+    """
+    output_json_path = os.path.join(
+        args.output_path,
+        f"{args.mode}_{args.model}_score_threshold_{args.score_threshold}_auto_eval_results.json",
+    )
+
+    # Resume support: map every finished task id to the label stored for it.
+    already_labels = {}
+    if os.path.exists(output_json_path):
+        with open(output_json_path, "r") as f:
+            for line in f:
+                if not line.strip():
+                    continue  # tolerate blank lines in the JSONL file
+                item = json.loads(line)
+                already_labels[item["task_id"]] = item.get("predicted_label", 0)
+
+    print(f"The number of already done tasks: {len(already_labels)}")
+    os.makedirs(args.output_path, exist_ok=True)
+
+    for task_id in task_subset:
+        # Skip already done task, but still report its stored verdict.
+        if task_id in already_labels:
+            with lock:
+                final_predicted_labels.append(already_labels[task_id])
+            continue
+
+        trajectory_images_path = os.path.join(args.trajectories_dir, task_id, "trajectory")
+
+        # Load results; optional fields default to None when absent.
+        with open(os.path.join(args.trajectories_dir, task_id, "result.json")) as f:
+            result = json.load(f)
+        output_results = copy.deepcopy(result)
+        task_description = result["task"]
+        action_history = result.get("action_history")
+        thoughts = result.get("thoughts")
+        final_result_response = result.get("final_result_response")
+        input_image_paths = result.get("input_image_paths")
+
+        print(f"Start evaluation for {task_description}")
+        # Build the judge prompt for the selected mode.
+        if args.mode == "Autonomous_eval":
+            screenshot_paths = _sorted_screenshot_paths(trajectory_images_path)
+            messages, text, system_msg = Autonomous_eval(task_description, action_history, screenshot_paths[-1])
+
+        elif args.mode == "AgentTrek_eval":
+            screenshot_paths = _sorted_screenshot_paths(trajectory_images_path)
+            messages, text, system_msg = AgentTrek_eval(task_description, action_history, thoughts, screenshot_paths[-1])
+
+        elif args.mode == "WebVoyager_eval":
+            screenshot_paths = _sorted_screenshot_paths(trajectory_images_path)
+            messages, text, system_msg = WebVoyager_eval(task_description, screenshot_paths, final_result_response)
+
+        elif args.mode == "WebJudge_Online_Mind2Web_eval":
+            screenshot_paths = _sorted_screenshot_paths(trajectory_images_path)
+            messages, text, system_msg, record, key_points = asyncio.run(WebJudge_Online_Mind2Web_eval(task_description, action_history, screenshot_paths, model, args.score_threshold))
+            output_results["image_judge_record"] = record
+            output_results["key_points"] = key_points
+
+        elif args.mode == "WebJudge_general_eval":
+            screenshot_paths = _sorted_screenshot_paths(trajectory_images_path)
+            messages, text, system_msg, record, key_points = asyncio.run(WebJudge_general_eval(task_description, input_image_paths, thoughts, action_history, screenshot_paths, model, args.score_threshold))
+            output_results["image_judge_record"] = record
+            output_results["key_points"] = key_points
+
+        else:
+            raise ValueError(f"Unknown mode: {args.mode}")
+
+        response = model.generate(messages)[0]
+        predicted_label = extract_predication(response, args.mode)
+
+        # Store evaluation details
+        output_results["task_id"] = task_id
+        output_results["input_text"] = text
+        output_results["system_msg"] = system_msg
+        output_results["evaluation_details"] = {"response": response, "predicted_label": predicted_label}
+        output_results["predicted_label"] = predicted_label
+
+        with lock:
+            final_predicted_labels.append(predicted_label)
+
+        print(f"Finish evaluation for {task_description}")
+        print("="*20)
+        # The lock keeps lines written by different workers from interleaving.
+        with lock:
+            with open(output_json_path, "a") as f_out:
+                f_out.write(json.dumps(output_results) + "\n")
+
+
+def process_subset(task_subset, args, final_predicted_labels, lock, model):
+    """Process entry point: run ``auto_eval`` on one slice of the task list.
+
+    Only the argument order differs from ``auto_eval``; the task slice comes
+    first here.
+    """
+    auto_eval(
+        args,
+        task_subset,
+        final_predicted_labels,
+        lock,
+        model,
+    )
+
+def parallel_eval(args, num_workers=60):
+    """Split the trajectory folders across worker processes and judge them.
+
+    Args:
+        args: Parsed CLI namespace; reads ``trajectories_dir``, ``model`` and
+            ``api_key`` here and is forwarded to every worker.
+        num_workers: Upper bound on the number of worker processes.
+
+    Prints the overall success rate once every worker has finished.
+    """
+    # Evaluate in parallel based on the number of workers
+    task_dirs = [
+        d for d in sorted(os.listdir(args.trajectories_dir))
+        if os.path.isdir(os.path.join(args.trajectories_dir, d))
+    ]
+    print(f"Evaluating {len(task_dirs)} tasks in total.")
+    if not task_dirs:
+        # Nothing to split and nothing to average over.
+        print("Evaluation complete.")
+        return
+
+    # Never use more workers than tasks, and round the chunk size up so the
+    # number of chunks cannot exceed the worker count. Floor division gave a
+    # zero step for range() whenever there were fewer tasks than workers.
+    worker_count = max(1, min(num_workers, len(task_dirs)))
+    chunk_size = -(-len(task_dirs) // worker_count)
+    task_subsets = [task_dirs[i:i + chunk_size] for i in range(0, len(task_dirs), chunk_size)]
+
+    # Load model
+    model = OpenaiEngine(
+        model=args.model,
+        api_key=args.api_key
+    )
+
+    lock = multiprocessing.Lock()
+    with multiprocessing.Manager() as manager:
+        final_predicted_labels = manager.list()
+        processes = []
+        for subset in task_subsets:
+            p = multiprocessing.Process(target=process_subset, args=(subset, args, final_predicted_labels, lock, model))
+            p.start()
+            processes.append(p)
+
+        for p in processes:
+            p.join()
+
+        # Read the shared list before the manager shuts down.
+        success_num = sum(final_predicted_labels)
+
+    print("Evaluation complete.")
+    print(f"The success rate is {(success_num / len(task_dirs)) * 100}.")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse options and run the parallel evaluation.
+    parser = argparse.ArgumentParser(description="Auto evaluation of web navigation tasks.")
+    # The default must be one of the modes auto_eval dispatches on; the former
+    # default 'Online_Mind2Web_eval' matched none of them and always failed
+    # with "Unknown mode".
+    parser.add_argument(
+        '--mode',
+        type=str,
+        default='WebJudge_Online_Mind2Web_eval',
+        choices=[
+            'Autonomous_eval',
+            'AgentTrek_eval',
+            'WebVoyager_eval',
+            'WebJudge_Online_Mind2Web_eval',
+            'WebJudge_general_eval',
+        ],
+        help='the mode of evaluation',
+    )
+    parser.add_argument('--model', type=str, default='gpt-4o')
+    parser.add_argument("--trajectories_dir", type=str, required=True, help="Path to trajectories directory")
+    parser.add_argument("--api_key", type=str, required=True, help="The api key")
+    parser.add_argument("--output_path", type=str, required=True, help="The output path")
+    parser.add_argument('--score_threshold', type=int, default=3)
+    parser.add_argument('--num_worker', type=int, default=60)
+    args = parser.parse_args()
+
+    parallel_eval(args, args.num_worker)
+
--- a/webeval/src/webeval/benchmarks/om2w/impl/src/utils.py
+++ b/webeval/src/webeval/benchmarks/om2w/impl/src/utils.py
@@ -0,0 +1,125 @@
+import base64
+import io
+from openai import (
+    APIConnectionError,
+    APIError,
+    RateLimitError,
+    AzureOpenAI,
+    OpenAI
+)
+import os
+import backoff
+
+def encode_image(image):
+    """Convert a PIL image to a base64 string of its JPEG encoding.
+
+    Args:
+        image: A PIL image in any mode.
+
+    Returns:
+        The JPEG bytes of the image, base64-encoded and decoded as UTF-8 text.
+    """
+    # JPEG has no alpha channel or palette, so saving modes such as RGBA, LA
+    # or P fails. Modes outside the set the JPEG writer is understood to
+    # accept (TODO confirm against the installed Pillow version) are
+    # converted to RGB first; RGBA was already handled this way.
+    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
+    if image.mode not in jpeg_writable_modes:
+        image = image.convert("RGB")
+    buffered = io.BytesIO()
+    image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+def extract_predication(response, mode):
+    """Extract the binary verdict (1 = success, 0 = failure) from a judge response.
+
+    Args:
+        response: Text returned by the judge model. Anything that is not a
+            string (for example ``None``) counts as a failure.
+        mode: Name of the evaluation method that produced the response.
+
+    Returns:
+        1 for success, 0 for failure.
+
+    Raises:
+        ValueError: If ``mode`` is not a known evaluation mode.
+    """
+    # These modes all end their answer with a "Status: ..." line.
+    status_modes = (
+        "Autonomous_eval",
+        "AgentTrek_eval",
+        "WebJudge_Online_Mind2Web_eval",
+        "WebJudge_general_eval",
+    )
+    if mode in status_modes:
+        if not isinstance(response, str):
+            return 0
+        segments = response.lower().split('status:')
+        # No "status:" marker at all means no verdict was given.
+        if len(segments) < 2:
+            return 0
+        # Only the text between the first and second marker is inspected.
+        return 1 if "success" in segments[1] else 0
+
+    if mode == "WebVoyager_eval":
+        if not isinstance(response, str):
+            return 0
+        # Success unless the response contains the upper-case word FAILURE.
+        return 0 if "FAILURE" in response else 1
+
+    raise ValueError(f"Unknown mode: {mode}")
+
+
+class OpenaiEngine():
+    """Small wrapper around the OpenAI chat-completions client with retries."""
+
+    def __init__(
+        self,
+        api_key=None,
+        stop=None,
+        rate_limit=-1,
+        model=None,
+        tokenizer=None,
+        temperature=0,
+        port=-1,
+        endpoint_target_uri = "",
+        **kwargs,
+    ) -> None:
+        """Init an OpenAI GPT/Codex engine
+
+        Args:
+            api_key (str | list, optional): Auth key(s) from OpenAI. Falls back
+                to the OPENAI_API_KEY environment variable. Defaults to None.
+            stop (list, optional): Tokens indicate stop of sequence. Defaults
+                to an empty list.
+            rate_limit (int, optional): Max number of requests per minute;
+                -1 disables the interval. Defaults to -1.
+            model (str, optional): Model name used by ``generate`` when no
+                per-call model is given. Defaults to None.
+            temperature (float, optional): Stored on the instance; note that
+                ``generate`` uses its own ``temperature`` argument.
+            tokenizer, port, endpoint_target_uri, **kwargs: Accepted but not
+                used by this class.
+
+        Raises:
+            ValueError: If ``api_key`` is neither a string nor a non-empty list.
+        """
+        assert (
+                os.getenv("OPENAI_API_KEY", api_key) is not None
+        ), "must pass on the api_key or set OPENAI_API_KEY in the environment"
+        if api_key is None:
+            api_key = os.getenv("OPENAI_API_KEY", api_key)
+        if isinstance(api_key, str):
+            self.api_keys = [api_key]
+        elif isinstance(api_key, list):
+            self.api_keys = api_key
+        else:
+            raise ValueError("api_key must be a string or list")
+        if not self.api_keys:
+            raise ValueError("api_key list must not be empty")
+        # A fresh list per instance; a literal [] default would be shared.
+        self.stop = [] if stop is None else stop
+        self.temperature = temperature
+        self.model = model
+        # convert rate limit to minimum request interval
+        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit
+        self.next_avil_time = [0] * len(self.api_keys)
+        # The client takes a single key, so use the first one even when a
+        # list of keys was supplied.
+        self.client = OpenAI(
+                        api_key=self.api_keys[0],
+                    )
+
+    # Deliberately a plain function without `self`: it is only referenced by
+    # the decorator below while the class body is being executed.
+    def log_error(details):
+        print(f"Retrying in {details['wait']:0.1f} seconds due to {details['exception']}")
+
+    @backoff.on_exception(
+        backoff.expo,
+        (APIError, RateLimitError, APIConnectionError),
+        max_tries=3,
+        on_backoff=log_error
+    )
+    def generate(self, messages, max_new_tokens=512, temperature=0, model=None, **kwargs):
+        """Send ``messages`` to the chat-completions API and return the reply texts.
+
+        Args:
+            messages: Chat messages in the OpenAI message format.
+            max_new_tokens: Passed to the API as ``max_tokens``.
+            temperature: Sampling temperature for this call.
+            model: Per-call model override; falls back to ``self.model``.
+            **kwargs: Forwarded unchanged to ``chat.completions.create``.
+
+        Returns:
+            A list with the message content of every returned choice.
+        """
+        response = self.client.chat.completions.create(
+            model=model if model else self.model,
+            messages=messages,
+            max_tokens=max_new_tokens,
+            temperature=temperature,
+            **kwargs,
+        )
+        return [choice.message.content for choice in response.choices]
+    
--- a/webeval/src/webeval/benchmarks/om2w/om2w.py
+++ b/webeval/src/webeval/benchmarks/om2w/om2w.py
@@ -0,0 +1,190 @@
+
+from ...benchmark import Benchmark
+from typing import Optional, Dict, Any, List
+from pathlib import Path
+import os
+from autogen_core.components.models import UserMessage, SystemMessage, LLMMessage, ChatCompletionClient
+from autogen_core.components import Image as AGImage
+from autogen_core.components.models import ChatCompletionClient
+from webeval.benchmarks.om2w.impl.src.methods import agenttrek_eval, automomous_eval, webjudge_general_eval, webjudge_online_mind2web, webvoyager_eval
+from webeval.benchmarks.om2w.impl.src.utils import extract_predication
+from webeval.trajectory import Trajectory, FinalAnswer
+from ...evaluators import compute_aggregate_metrics_gpt_evaluator, safe_mean, compute_refusal_metrics
+import asyncio
+import json
+
+def _content_2_agento(content):
+    """Convert OpenAI-style message content into content for the agent messages.
+
+    Strings are returned unchanged, ``{'type': 'text'}`` parts become their
+    text, ``{'type': 'image_url'}`` parts become ``AGImage.from_uri(url)``,
+    and lists are converted element by element.
+
+    Raises:
+        ValueError: For an unknown part ``type`` or an unsupported value type.
+    """
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return [_content_2_agento(part) for part in content]
+    if isinstance(content, dict):
+        part_type = content['type']
+        if part_type == 'text':
+            return content['text']
+        if part_type == 'image_url':
+            return AGImage.from_uri(content['image_url']['url'])
+        raise ValueError(f"Unsupported content type: {part_type}")
+    raise ValueError(f"Unsupported content type: {type(content)}")
+
+def _messages_2_agento(messages):
+    """Turn OpenAI-style chat messages into ``UserMessage``/``SystemMessage`` objects.
+
+    Messages whose role is neither 'user' nor 'system' are left out.
+    """
+    converted = []
+    for message in messages:
+        role = message['role']
+        if role == 'system':
+            converted.append(SystemMessage(content=_content_2_agento(message['content'])))
+        elif role == 'user':
+            converted.append(UserMessage(content=_content_2_agento(message['content']), source='user'))
+    return converted
+
+class _ModelWrapper:
+    """Adapter giving a ``ChatCompletionClient`` the ``generate`` interface the judges call."""
+
+    def __init__(self, model_client: ChatCompletionClient):
+        self.model_client = model_client
+
+    def generate(self, messages, **kwargs) -> List[Any]:
+        """Blocking call: return a one-element list holding the reply content.
+
+        Starts its own event loop through ``asyncio.run``.
+        """
+        response = asyncio.run(self.model_client.create(_messages_2_agento(messages), **kwargs))
+        return [response.content]
+
+    async def agenerate(self, messages, **kwargs) -> List[Any]:
+        """Awaitable variant of ``generate`` with the same one-element list result."""
+        response = await self.model_client.create(_messages_2_agento(messages), **kwargs)
+        return [response.content]
+
+class OnlineM2WBenchmark(Benchmark):
+    """Online-Mind2Web benchmark whose trajectories are scored by an LLM judge."""
+
+    def __init__(self,
+                data_dir: Path,
+                eval_method = 'AgentTrek_eval',
+                model_client: Optional[ChatCompletionClient] = None,
+                data_az_url: Optional[str] = None,
+                score_threshold: float = 3):
+        """Set up the benchmark.
+
+        Args:
+            data_dir: Directory holding ``Online_Mind2Web_06042025.json``;
+                created if it does not exist.
+            eval_method: Name of the judge method used by ``evaluator``.
+            model_client: Client used for the judge calls.
+            data_az_url: Dataset identifier used in ``exec_hash``; defaults
+                to the dataset file name.
+            score_threshold: Threshold forwarded to the WebJudge methods.
+        """
+        self.eval_method = eval_method
+        # Keep the raw client as well: compute_aggregate_metrics passes it
+        # to compute_refusal_metrics.
+        self.original_model_client = model_client
+        self.model_client = _ModelWrapper(model_client)
+        data_dir = Path(data_dir).expanduser()
+        data_dir.mkdir(parents=True, exist_ok=True)
+        self.data_file = data_dir / 'Online_Mind2Web_06042025.json'
+        super().__init__(
+            name='OnlineM2W',
+            data_dir=data_dir)
+        self.data_az_url = "Online_Mind2Web_06042025.json" if data_az_url is None else data_az_url
+        self.score_threshold = score_threshold
+
+    def download_dataset(self) -> None:
+        """No download is performed; only a note about the local data is printed."""
+        print("Updated datasets for om2w are stored locally in webeval/data/om2w")
+
+    def load_dataset(self) -> None:
+        """Read the dataset file and store the normalized examples in ``self.examples``."""
+        with open(self.data_file, 'r', encoding='utf-8') as f:
+            examples = json.load(f)
+        self.examples = [
+            {
+                'id': ex['task_id'],
+                'level': ex['level'],
+                'question': ex['confirmed_task'],
+                'web': ex['website'],
+                'reference_length': ex['reference_length']
+                } for ex in examples
+        ]
+
+    def evaluator(self, task_data: Dict[str, Any], candidate: Trajectory) -> Any:
+        """Judge one trajectory with the configured evaluation method.
+
+        Returns:
+            A ``(verdict, judge_response_text)`` tuple.
+
+        Raises:
+            NotImplementedError: If ``self.eval_method`` is not handled here.
+        """
+        if self.eval_method == 'AgentTrek_eval':
+            messages, _, __ = agenttrek_eval.AgentTrek_eval(
+                task=task_data['question'],
+                last_actions = candidate.actions,
+                thoughts = candidate.thoughts,
+                images_path = candidate.latest_screenshot
+                )
+        elif self.eval_method == 'Autonomous_eval':
+            messages, _, __ = automomous_eval.Autonomous_eval(
+                task=task_data['question'],
+                last_actions = candidate.actions,
+                images_path = candidate.latest_screenshot
+            )
+        elif self.eval_method == "WebJudge_general_eval":
+            messages, _, __, ___, ____ = asyncio.run(
+                webjudge_general_eval.WebJudge_general_eval(
+                    task_data['question'],
+                    None,  # TODO: input image path
+                    candidate.thoughts,
+                    candidate.actions,
+                    candidate.screenshots,
+                    self.model_client,
+                    self.score_threshold))
+        elif self.eval_method == "WebJudge_Online_Mind2Web_eval":
+            messages, _, __, ___, ____ = asyncio.run(
+                webjudge_online_mind2web.WebJudge_Online_Mind2Web_eval(
+                    task_data['question'],
+                    candidate.actions,
+                    candidate.screenshots,
+                    self.model_client,
+                    self.score_threshold))
+        else:
+            raise NotImplementedError(f"Evaluation method {self.eval_method} is not implemented.")
+        # The prompt built above is sent to the judge for the final verdict.
+        verdict, gpt_response_text = asyncio.run(
+            self.gpt_evaluator_async(messages)
+        )
+        return verdict, gpt_response_text
+
+    async def gpt_evaluator_async(self, messages) -> tuple:
+        """Send ``messages`` to the judge and return ``(verdict, response_text)``."""
+        response = await self.model_client.agenerate(messages)
+        verdict = extract_predication(response[0], self.eval_method)
+        return verdict, response[0]
+
+    def get_split_examples(self, split: str) -> List[Dict[str, Any]]:
+        """Return the examples of one difficulty level, or all of them for ``'*'``.
+
+        Raises:
+            ValueError: If ``split`` is not 'easy', 'medium', 'hard' or '*'.
+        """
+        if split == '*':
+            return self.examples
+        if split in ('easy', 'medium', 'hard'):
+            return [ex for ex in self.examples if ex["level"] == split]
+
+        raise ValueError(f"Unsupported split: {split}")
+
+    def compute_aggregate_metrics(self, results: List[Any]) -> Dict[str, float]:
+        """Aggregate scores globally and per difficulty level.
+
+        Each result must expose ``qid`` and ``score``; a ``qid`` that is not
+        among the loaded examples raises ``KeyError``.
+        """
+        # Build a mapping from id to level for all examples
+        id_to_level = {
+            ex["id"]: ex["level"]
+            for ex in self.examples
+        }
+
+        # Group scores by level (using all results, not filtered)
+        level_to_scores = {}
+        for score in results:
+            level = id_to_level[score.qid]
+            level_to_scores.setdefault(level, []).append(score.score)
+
+        # Mean score (accuracy) and sample count per level, plus flat
+        # "accuracy_<level>" / "samples_<level>" entries.
+        accuracy_by_level = {
+            level: (safe_mean(scores), len(scores))
+            for level, scores in level_to_scores.items()
+        }
+        accuracy_for_key = {
+            f"accuracy_{level}": safe_mean(scores)
+            for level, scores in level_to_scores.items()
+        }
+        samples_by_key = {
+            f"samples_{level}": len(scores)
+            for level, scores in level_to_scores.items()
+        }
+
+        # Compute global metrics
+        metrics = compute_aggregate_metrics_gpt_evaluator(results)
+
+        if self.original_model_client is not None:
+            refusal_metrics = compute_refusal_metrics(results, self.original_model_client)
+            metrics.update(refusal_metrics)
+
+        metrics["accuracy_by_level"] = accuracy_by_level
+        metrics.update(accuracy_for_key)
+        metrics.update(samples_by_key)
+        return metrics
+
+    def exec_hash(self) -> str:
+        """Extend the base execution hash with the dataset file name."""
+        return f"{super().exec_hash()}_{(self.data_az_url or '').split('/')[-1]}"
+
+    def eval_hash(self) -> str:
+        """Identify the evaluation setup by method name and score threshold."""
+        return f"{self.eval_method}-{self.score_threshold}"
+