mirror of
https://github.com/microsoft/fara.git
synced 2026-06-10 02:54:01 +08:00
Three changes in one PR: 1. Remove webeval's dependency on ``autogen-core`` / ``autogen-ext``. All chat completion clients, message types, and the graceful-retry layer now live under ``webeval/src/webeval/oai_clients/`` — self-contained wrappers around openai / azure-identity. Install no longer needs the autogen submodule; just ``pip install -e .[vllm]`` then ``cd webeval; pip install -e .``. 2. Incorporate the initial (now-stale) WebTailBench benchmark into the codebase. ``webeval/src/webeval/benchmarks/webtailbench/`` + ``webeval/scripts/webtailbench.py``. Loader auto-downloads ``WebTailBench-v1-rubrics.tsv`` from ``huggingface.co/datasets/microsoft/WebTailBench`` and threads each task's published ``precomputed_rubric`` through to the verifier so rubrics never get regenerated. 3. Release the Universal Verifier (``MMRubricAgent``) as the official judge for WebTailBench. Multimodal, rubric-grounded, two-model ensemble (``gpt-5.2`` + ``o4-mini``) with per-criterion scoring, outcome verification, ambiguity / invalid-task classification, and first-point-of-failure analysis. ``webeval/scripts/verify_trajectories.py`` is a stand-alone parallel runner that re-scores any directory of webeval-shaped trajectories without touching the solver. Documentation: repo-root README ``Updates`` section + Reproducibility CLI block; ``webeval/README.md`` documents the Trajectory / FinalAnswer schema, the ``<no_answer>`` semantics, and per-benchmark score-file shape. Tests: 18 passing, 1 skipped (opt-in HF download). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
63 lines
1.0 KiB
TOML
63 lines
1.0 KiB
TOML
# PEP 517 build configuration: the package is built with Hatchling.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Core package metadata (PEP 621).
[project]
name = "webeval"
version = "0.0.1"
license = "MIT"
# NOTE(review): the description is empty; consider a one-line summary so the
# built package metadata is not blank.
description = ""
readme = "README.md"
requires-python = ">=3.10"
keywords = []
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

# Runtime requirements, one PEP 508 string per line.
dependencies = [
    "openai",
    "playwright",
    "docker",
    "huggingface_hub",
    "tabulate",
    "azure-identity",
    "pandas",
    "scipy",
    "pydantic",
    "nest_asyncio",
    "qwen-agent",
    # Exact pins. NOTE(review): presumably these five are kept in lockstep so
    # the vllm / torch / transformers builds stay mutually compatible —
    # confirm before bumping any one of them on its own.
    "vllm==0.10.0",
    "torch==2.7.1",
    "torchvision==0.22.1",
    "torchaudio==2.7.1",
    "transformers==4.53.3",
    "backoff",
    "joblib",
    "mlflow",
    "rpds-py",
    "pillow",
    "tiktoken",
]

# Development-only tooling resolved by uv (task runner and linter/formatter).
# NOTE(review): uv documents `tool.uv.dev-dependencies` as the legacy spelling
# of the `dev` group in `[dependency-groups]` (PEP 735) — consider migrating.
[tool.uv]
dev-dependencies = [
    "poethepoet",
    "ruff",
]

# Console entry point: installs a `webeval` command that invokes `main` from
# the `webeval.cli` module.
[project.scripts]
webeval = "webeval.cli:main"

# Lets Hatch accept direct references (`name @ <url>`) in dependency lists.
# NOTE(review): no dependency visible in this file uses a direct reference, so
# this may be a leftover — confirm before removing.
[tool.hatch.metadata]
allow-direct-references = true

# Poe the Poet task shortcuts, run as `poe <task>`.
[tool.poe.tasks]
fmt = "ruff format"
format.ref = "fmt"  # alias: `format` runs the `fmt` task
lint = "ruff check"