import gradio as gr
import evaluate

# Load the L3Score metric from the Hugging Face Hub.
l3score = evaluate.load("nhop/L3Score")


def compute_l3score(api_key, provider, model, questions, predictions, references):
    """Split the newline-separated textbox inputs and compute L3Score with the selected LLM judge."""
    try:
        result = l3score.compute(
            questions=[q.strip() for q in questions.split("\n") if q.strip()],
            predictions=[p.strip() for p in predictions.split("\n") if p.strip()],
            references=[r.strip() for r in references.split("\n") if r.strip()],
            api_key=api_key,
            provider=provider,
            model=model,
        )
        return result
    except Exception as e:
        return {"error": str(e)}

with gr.Blocks() as demo:
    gr.Markdown(r"""
# Metric: L3Score
""")
    with gr.Row():
        api_key = gr.Textbox(label="API Key", type="password")
        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
        model = gr.Textbox(label="Model", value="gpt-4o-mini")
    with gr.Row():
        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")

    compute_button = gr.Button("Compute L3Score")
    output = gr.JSON(label="L3Score Result")

    compute_button.click(
        fn=compute_l3score,
        inputs=[api_key, provider, model, questions, predictions, references],
        outputs=output,
    )

    gr.Markdown(r"""
## Description

**L3Score** evaluates how semantically close a model-generated answer is to a reference answer for a given question. It prompts a **language model as a judge** with the following template:

```text
You are given a question, ground-truth answer, and a candidate answer.
Question: {{question}}
Ground-truth answer: {{gt}}
Candidate answer: {{answer}}
Is the semantic meaning of the ground-truth and candidate answers similar?
Answer in one word - Yes or No.
```

The judge model's **log-probabilities** for the "Yes" and "No" tokens are used to compute the score.
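
As an illustration only (not the metric's internal code), top-5 next-token log-probabilities can be requested from the OpenAI Chat Completions API roughly as follows; the prompt assembly and model choice below are placeholders:

```python
from openai import OpenAI

question, gt, answer = "What is the capital of France?", "Paris", "Paris"
prompt = (
    "You are given a question, ground-truth answer, and a candidate answer.\n"
    f"Question: {question}\n"
    f"Ground-truth answer: {gt}\n"
    f"Candidate answer: {answer}\n"
    "Is the semantic meaning of the ground-truth and candidate answers similar?\n"
    "Answer in one word - Yes or No."
)

client = OpenAI(api_key="your-openai-api-key")
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=1,     # only the one-word verdict is needed
    logprobs=True,
    top_logprobs=5,   # return the five most likely next tokens with log-probabilities
)

# Each entry has .token and .logprob, e.g. token "Yes" with logprob -0.02.
top5 = response.choices[0].logprobs.content[0].top_logprobs
```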
---""")
    gr.Markdown("""## Scoring Logic""")
    gr.Markdown(
        r"""Let $l_{\text{yes}}$ and $l_{\text{no}}$ be the log-probabilities of 'Yes' and 'No', respectively.""",
        latex_delimiters=[{"left": "$", "right": "$", "display": False}],
    )
    gr.Markdown(r"""
- If neither token is in the top-5:
$$
\text{L3Score} = 0
$$
- If both are present:
$$
\text{L3Score} = \frac{\exp(l_{\text{yes}})}{\exp(l_{\text{yes}}) + \exp(l_{\text{no}})}
$$
- If only one is present, the missing token's probability is estimated as the minimum of:
  - the probability mass remaining outside the top-5 tokens
  - the probability of the least likely top-5 token

  The same ratio is then computed with this estimate (see the sketch below).

The score ranges from 0 to 1, where 1 indicates the highest confidence by the LLM judge that the predicted and reference answers are semantically equivalent.

See the [SPIQA paper](https://arxiv.org/pdf/2407.09413) for details.
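
The rule above can be written compactly in code. The sketch below is illustrative only, not the metric's actual implementation; the function name and the exact token matching (casing, leading spaces) are assumptions:

```python
import math

def l3score_from_top_logprobs(top_logprobs, yes_token="Yes", no_token="No"):
    # Score one (question, prediction, reference) triplet from the judge's
    # top-5 next-token log-probabilities, e.g. {"Yes": -0.02, "No": -4.1, ...}.
    probs = {tok: math.exp(lp) for tok, lp in top_logprobs.items()}
    has_yes, has_no = yes_token in probs, no_token in probs

    # Neither "Yes" nor "No" made it into the top-5: score 0.
    if not has_yes and not has_no:
        return 0.0

    # Only one of the two is present: estimate the missing probability as the
    # minimum of the leftover mass and the least likely top-5 token.
    if not (has_yes and has_no):
        leftover = max(0.0, 1.0 - sum(probs.values()))
        estimate = min(leftover, min(probs.values()))
        probs.setdefault(yes_token, estimate)
        probs.setdefault(no_token, estimate)

    # Both probabilities are now available (observed or estimated): normalise over the pair.
    return probs[yes_token] / (probs[yes_token] + probs[no_token])
```

With the `top5` list from the request sketch above, `l3score_from_top_logprobs({t.token: t.logprob for t in top5})` would follow the rule described here.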
---
## How to Use

```python
import evaluate

l3score = evaluate.load("nhop/L3Score")

questions = ["What is the capital of France?", "What is the capital of Germany?"]
predictions = ["Paris", "Moscow"]
references = ["Paris", "Berlin"]

score = l3score.compute(
    questions=questions,
    predictions=predictions,
    references=references,
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini",
)
print(score)
# {'L3Score': 0.49..., 'Cost': ...}
```
---
## Inputs

| Name          | Type        | Description                                                                                        |
|---------------|-------------|----------------------------------------------------------------------------------------------------|
| `questions`   | `list[str]` | The list of input questions.                                                                       |
| `predictions` | `list[str]` | Answers generated by the model being evaluated.                                                    |
| `references`  | `list[str]` | Ground-truth or reference answers.                                                                 |
| `api_key`     | `str`       | API key for the selected LLM provider.                                                             |
| `provider`    | `str`       | LLM provider used as the judge; must support top-n token log-probabilities. **Default**: `openai`  |
| `model`       | `str`       | Name of the judge LLM. **Default**: `gpt-4o-mini`                                                  |
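
Switching providers changes only `api_key`, `provider`, and `model`; the call is otherwise identical. A hedged sketch using the DeepSeek provider offered in the demo above (the model name is an assumption, check the provider's current model list):

```python
score = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key="your-deepseek-api-key",
    provider="deepseek",      # must expose top-n token log-probabilities
    model="deepseek-chat",    # assumed model name, verify before use
)
```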

## Output

Calling the `compute` method returns a dictionary with two entries:

```python
{"L3Score": float, "Cost": float}
```

`L3Score` is the **average score** over all (question, prediction, reference) triplets, and `Cost` is the total cost of the API calls made to the judge model.
---
## Examples

```python
import evaluate

l3score = evaluate.load("nhop/L3Score")

score = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini",
)
print(score)
# {'L3Score': 0.99..., 'Cost': ...}

score = l3score.compute(
    questions=["What is the capital of Germany?"],
    predictions=["Moscow"],
    references=["Berlin"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini",
)
print(score)
# {'L3Score': 0.00..., 'Cost': ...}
```
---
## Limitations and Bias

- Requires a judge model that exposes **top-n token log-probabilities** (e.g., OpenAI, DeepSeek, xAI).
- Scores are **only comparable when using the same judge model**.

## Citation

```bibtex
@article{pramanick2024spiqa,
  title   = {SPIQA: A Dataset for Multimodal Question Answering on Scientific Papers},
  author  = {Pramanick, Shraman and Chellappa, Rama and Venugopalan, Subhashini},
  journal = {arXiv preprint arXiv:2407.09413},
  year    = {2024}
}
```
""")

demo.launch()