Added OpenRouter + env file; basic functionality working
Browse files- .env.example +2 -0
- .gitignore +1 -0
- README.md +56 -1
- TestQuesitons.txt +3 -0
- app.py +47 -14
- watch.py +36 -0
.env.example
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
OPENROUTER_API_KEY=your_api_key_here
|
| 2 |
+
OPENROUTER_BASE_URL=https://openrouter.ai/api/v1/chat/completions
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
.env
|
README.md
CHANGED
|
@@ -1,6 +1,61 @@
|
|
| 1 |
# Vibes Benchmark v0.1
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
Run it with
|
| 6 |
`python app.py`
|
|
|
|
| 1 |
# Vibes Benchmark v0.1
|
| 2 |
|
| 3 |
+
A tool for benchmarking different AI models by comparing their responses to custom questions.
|
| 4 |
+
|
| 5 |
+
## Prerequisites
|
| 6 |
+
|
| 7 |
+
- Python 3.8 or higher
|
| 8 |
+
- An OpenRouter API key ([Get one here](https://openrouter.ai/))
|
| 9 |
+
|
| 10 |
+
## Setup
|
| 11 |
+
|
| 12 |
+
1. Clone the repository:
|
| 13 |
+
```bash
|
| 14 |
+
git clone [repository-url]
|
| 15 |
+
cd vibes-benchmark
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
2. Install dependencies:
|
| 19 |
+
```bash
|
| 20 |
+
pip install -r requirements.txt
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
3. Configure environment variables:
|
| 24 |
+
```bash
|
| 25 |
+
cp .env.example .env
|
| 26 |
+
```
|
| 27 |
+
Then edit `.env` and add your OpenRouter API key
|
| 28 |
+
|
| 29 |
+
## Usage
|
| 30 |
+
|
| 31 |
+
1. Prepare a text file with your questions (one per line)
|
| 32 |
+
2. Run the application:
|
| 33 |
+
```bash
|
| 34 |
+
python app.py
|
| 35 |
+
```
|
| 36 |
+
3. Upload your questions file through the web interface
|
| 37 |
+
4. Click "Run Benchmark" to start comparing model responses
|
| 38 |
+
|
| 39 |
+
## Features
|
| 40 |
+
|
| 41 |
+
- Compare responses from different AI models side by side
|
| 42 |
+
- Supports up to 10 questions per benchmark
|
| 43 |
+
- Randomly selects different models for comparison
|
| 44 |
+
- Real-time response generation
|
| 45 |
+
|
| 46 |
+
## Supported Models
|
| 47 |
+
|
| 48 |
+
- Claude 3 Opus
|
| 49 |
+
- Claude 3 Sonnet
|
| 50 |
+
- Gemini Pro
|
| 51 |
+
- Mistral Medium
|
| 52 |
+
- Claude 2.1
|
| 53 |
+
- GPT-4 Turbo
|
| 54 |
+
- GPT-3.5 Turbo
|
| 55 |
+
|
| 56 |
+
## License
|
| 57 |
+
|
| 58 |
+
[Your chosen license]
|
| 59 |
|
| 60 |
Run it with
|
| 61 |
`python app.py`
|
TestQuesitons.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
How many states are in America?
|
| 2 |
+
|
| 3 |
+
How much wood could a woodchuck chuck if a woodchuck could chuck wood?
|
app.py
CHANGED
|
@@ -1,6 +1,12 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import random
|
| 3 |
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
MAX_QUESTIONS = 10 # Maximum number of questions to support
|
| 6 |
|
|
@@ -8,26 +14,53 @@ MAX_QUESTIONS = 10 # Maximum number of questions to support
|
|
| 8 |
# Fix the models
|
| 9 |
#
|
| 10 |
MODELS = [
|
| 11 |
-
"anthropic/claude-3-opus",
|
| 12 |
-
"anthropic/claude-3-sonnet",
|
| 13 |
"google/gemini-pro",
|
| 14 |
-
"
|
| 15 |
-
"
|
| 16 |
-
"
|
| 17 |
-
"
|
| 18 |
]
|
| 19 |
#
|
| 20 |
######
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def get_response(question, model):
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
#
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def read_questions(file_obj):
|
| 33 |
"""Read questions from uploaded file and return as list"""
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import random
|
| 3 |
import time
|
| 4 |
+
import os
|
| 5 |
+
import requests
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables
|
| 9 |
+
load_dotenv()
|
| 10 |
|
| 11 |
MAX_QUESTIONS = 10 # Maximum number of questions to support
|
| 12 |
|
|
|
|
| 14 |
# Fix the models
|
| 15 |
#
|
| 16 |
# OpenRouter model identifiers the benchmark samples from.
MODELS = [
    "anthropic/claude-3-opus-20240229",
    "anthropic/claude-3-sonnet-20240229",
    "google/gemini-pro",
    "mistralai/mistral-medium",  # Updated from mistral-7b-instruct
    "anthropic/claude-2.1",
    "openai/gpt-4-turbo-preview",
    "openai/gpt-3.5-turbo",
]
|
| 25 |
#
|
| 26 |
######
|
| 27 |
|
| 28 |
+
# Pull API configuration from the environment (populated by load_dotenv()).
# OPENROUTER_API_KEY: bearer token for OpenRouter.
# OPENROUTER_BASE_URL: full chat-completions endpoint URL.
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
OPENROUTER_BASE_URL = os.getenv('OPENROUTER_BASE_URL')

# Fail fast at startup instead of erroring on the first request.
if not (OPENROUTER_API_KEY and OPENROUTER_BASE_URL):
    raise ValueError("Missing required environment variables. Please check your .env file.")
|
| 34 |
+
|
| 35 |
def get_response(question, model):
    """Get response from OpenRouter API for the given question and model.

    Args:
        question: The user prompt to send as a single user message.
        model: OpenRouter model identifier (e.g. "openai/gpt-3.5-turbo").

    Returns:
        The model's reply text, or an "Error: ..." string on failure.
        This function never raises: callers render whatever string comes
        back directly in the UI.
    """
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "http://localhost:7860",  # Replace with your actual domain
        "Content-Type": "application/json"
    }

    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": question}
        ]
    }

    try:
        response = requests.post(
            OPENROUTER_BASE_URL,
            headers=headers,
            json=data,
            timeout=30  # 30 second timeout
        )
        response.raise_for_status()

        result = response.json()
        # A 200 response can still carry an error payload or an unexpected
        # shape; keep the subscripting inside the try so a malformed body
        # is reported instead of crashing the Gradio handler.
        return result['choices'][0]['message']['content']

    except requests.exceptions.RequestException as e:
        return f"Error: Failed to get response from {model}: {str(e)}"
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # Non-JSON body, or JSON without the expected choices/message keys.
        return f"Error: Unexpected response from {model}: {str(e)}"
|
| 64 |
|
| 65 |
def read_questions(file_obj):
|
| 66 |
"""Read questions from uploaded file and return as list"""
|
watch.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from watchdog.observers import Observer
|
| 2 |
+
from watchdog.events import FileSystemEventHandler
|
| 3 |
+
import subprocess
|
| 4 |
+
import time
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
class AppReloader(FileSystemEventHandler):
    """Watchdog handler that launches app.py and restarts it on change."""

    def __init__(self):
        # Handle of the currently running app.py subprocess (None until started).
        self.process = None
        self.start_app()

    def start_app(self):
        """Terminate any running instance, then launch a fresh app.py."""
        if self.process:
            self.process.terminate()
            self.process.wait()  # reap the old process so it can't linger as a zombie
        print("\n--- Restarting app.py ---\n")
        # Use the same interpreter that is running this watcher.
        self.process = subprocess.Popen([sys.executable, "app.py"])

    def on_modified(self, event):
        # Compare the basename, not a suffix: endswith('app.py') would also
        # match unrelated files such as 'myapp.py'.
        filename = event.src_path.replace("\\", "/").rsplit("/", 1)[-1]
        if filename == "app.py":
            self.start_app()
|
| 22 |
+
|
| 23 |
+
if __name__ == "__main__":
    # Launch app.py right away, then watch the current directory
    # (non-recursively) and restart the app whenever it is modified.
    reloader = AppReloader()
    watcher = Observer()
    watcher.schedule(reloader, path='.', recursive=False)
    watcher.start()

    try:
        # Idle loop; the observer thread does the actual watching.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl-C: stop watching and tear down the child process.
        watcher.stop()
        if reloader.process:
            reloader.process.terminate()
    watcher.join()
|