# PhiRAG / advanced_rag_updated.py
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
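# Setting this before transformers/tokenizers are imported silences HF
# tokenizers' fork-related parallelism warnings in multi-process environments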
import datetime
import functools
import traceback
from typing import List, Optional, Any, Dict, Tuple
import csv
import pandas as pd
import tempfile
import shutil
import glob
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
# Other LangChain and community imports
from langchain_community.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.embeddings.base import Embeddings
from langchain.retrievers import EnsembleRetriever
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser, Document
from langchain_core.runnables import RunnableParallel, RunnableLambda
from transformers.quantizers.auto import AutoQuantizationConfig
import gradio as gr
from pydantic import PrivateAttr
import pydantic
from langchain.llms.base import LLM
import time
import re
import requests
from langchain_community.document_loaders import PyMuPDFLoader # Updated loader
import mimetypes
import gc
# Batch processing helper functions
def generate_parameter_values(min_val, max_val, num_values):
    """Generate `num_values` evenly spaced values between min_val and max_val (inclusive)."""
    if num_values == 1:
        return [min_val]
    step = (max_val - min_val) / (num_values - 1)
    return [min_val + (step * i) for i in range(num_values)]
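# Worked example: generate_parameter_values(0.1, 1.0, 4)
# -> [0.1, 0.4, 0.7, 1.0]  (step = 0.9 / 3 = 0.3; endpoints included, up to float rounding)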
def process_batch_query(query, model_choice, max_tokens, param_configs, slider_values, job_id, use_history=True):
    """Run a single query under every combination of the selected parameter ranges."""
    results = []
    # For each parameter, either hold the slider value constant or sweep an evenly
    # spaced range whose size is parsed from a "Whole range N values" config string
    temp_values = [slider_values['temperature']] if param_configs['temperature'] == "Constant" else generate_parameter_values(0.1, 1.0, int(param_configs['temperature'].split()[2]))
    top_p_values = [slider_values['top_p']] if param_configs['top_p'] == "Constant" else generate_parameter_values(0.1, 0.99, int(param_configs['top_p'].split()[2]))
    # Top-k must be an integer; rounding also keeps the "Top-k: (\d+)" regex in the CSV export valid
    top_k_values = [slider_values['top_k']] if param_configs['top_k'] == "Constant" else [int(round(v)) for v in generate_parameter_values(1, 100, int(param_configs['top_k'].split()[2]))]
    bm25_values = [slider_values['bm25']] if param_configs['bm25'] == "Constant" else generate_parameter_values(0.0, 1.0, int(param_configs['bm25'].split()[2]))
    total_combinations = len(temp_values) * len(top_p_values) * len(top_k_values) * len(bm25_values)
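    # Example grid: temperature swept over 3 values and top-p over 2, with top-k
    # and BM25 held constant, yields 3 * 2 * 1 * 1 = 6 combinations in total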
    current = 0
    for temp in temp_values:
        for top_p in top_p_values:
            for top_k in top_k_values:
                for bm25 in bm25_values:
                    current += 1
                    try:
                        # Update parameters
                        rag_chain.temperature = temp
                        rag_chain.top_p = top_p
                        rag_chain.top_k = top_k
                        rag_chain.bm25_weight = bm25
                        rag_chain.faiss_weight = 1.0 - bm25
                        # Update ensemble retriever
                        rag_chain.ensemble_retriever = EnsembleRetriever(
                            retrievers=[rag_chain.bm25_retriever, rag_chain.faiss_retriever],
                            weights=[rag_chain.bm25_weight, rag_chain.faiss_weight]
                        )
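                        # The rebuilt EnsembleRetriever blends the sparse (BM25) and
                        # dense (FAISS) rankings; the two weights are complementary,
                        # so they always sum to 1.0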
                        # Process query
                        response = rag_chain.elevated_rag_chain.invoke({"question": query})
                        # Store a trimmed copy of the response in history if enabled
                        if use_history:
                            trimmed_response = response[:1000] + ("..." if len(response) > 1000 else "")
                            rag_chain.conversation_history.append({"query": query, "response": trimmed_response})
                        # Format result
                        result = {
                            "Parameters": f"Temp: {temp:.2f}, Top-p: {top_p:.2f}, Top-k: {top_k}, BM25: {bm25:.2f}",
                            "Response": response,
                            "Progress": f"Query {current}/{total_combinations}"
                        }
                        results.append(result)
                    except Exception as e:
                        results.append({
                            "Parameters": f"Temp: {temp:.2f}, Top-p: {top_p:.2f}, Top-k: {top_k}, BM25: {bm25:.2f}",
                            "Response": f"Error: {str(e)}",
                            "Progress": f"Query {current}/{total_combinations}"
                        })
    # Format results with links to the generated CSV file
    formatted_results, csv_path = format_batch_result_files(
        results, job_id,
        embedding_model=getattr(rag_chain, 'embedding_model', 'unknown'),
        llm_model=model_choice,
        param_variations=param_configs
    )
    return (
        formatted_results,
        csv_path,
        f"Job ID: {job_id}",
        f"Input tokens: {count_tokens(query)}",
        f"Output tokens: {sum(count_tokens(r['Response']) for r in results)}"
    )
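# Illustrative call (hypothetical values; assumes the global `rag_chain` and
# `count_tokens`, defined elsewhere in this file, are already initialized):
#
#   process_batch_query(
#       "What is retrieval-augmented generation?",
#       model_choice="Mistral-7B",  # hypothetical model label
#       max_tokens=512,
#       param_configs={"temperature": "Whole range 3 values", "top_p": "Constant",
#                      "top_k": "Constant", "bm25": "Constant"},
#       slider_values={"temperature": 0.7, "top_p": 0.95, "top_k": 50, "bm25": 0.5},
#       job_id="job-001",
#   )
#
# This sweeps temperature over [0.1, 0.55, 1.0] while holding the other
# parameters at their slider values, producing 3 result rows.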
# ... (rest of the file omitted here; below are the batch-result CSV helpers)
def create_csv_from_batch_results(results: List[Dict], job_id: str,
                                  embedding_model: str = None, llm_model: str = None,
                                  param_variations: Dict = None) -> str:
    """Create a CSV file from batch query results and return the file path."""
    # Save CSV files in the current directory for HuggingFace Spaces compatibility,
    # under a descriptive, timestamped filename
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    def get_short_name(full_name):
        """Extract a short name from a full model name."""
        if not full_name:
            return "unknown"
        # Drop a leading emoji/prefix token to get the actual model name
        clean_name = full_name.split(" ", 1)[-1] if " " in full_name else full_name
        # Abbreviate long names to their first four plus last four characters
        if len(clean_name) > 8:
            return clean_name[:4] + clean_name[-4:]
        return clean_name
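    # Worked example (hypothetical label): get_short_name("🤖 Mistral-7B-v0.1")
    # -> "Mist" + "v0.1" == "Mistv0.1"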
    def get_param_variation_name(param_configs):
        """Summarize which parameters were varied, with their sweep sizes."""
        if not param_configs:
            return "const"
        varied_params = []
        for param, config in param_configs.items():
            if config != "Constant":
                # Extract the count from a "Whole range X values" config string
                if "values" in config:
                    num_values = config.split()[2] if len(config.split()) > 2 else "X"
                    varied_params.append(f"{param}_{num_values}")
        if not varied_params:
            return "const"
        return "_".join(varied_params)
    # Build filename components
    embedding_short = get_short_name(embedding_model) if embedding_model else "emb"
    llm_short = get_short_name(llm_model) if llm_model else "llm"
    param_short = get_param_variation_name(param_variations) if param_variations else "const"
    # Filename pattern: batch_<embedding>_<llm>_<params>_<timestamp>.csv
    csv_filename = f"batch_{embedding_short}_{llm_short}_{param_short}_{timestamp}.csv"
    csv_path = os.path.abspath(csv_filename)
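    # e.g. "batch_all-6-v2_Mistv0.1_temperature_3_20240101_120000.csv"
    # (illustrative values: "🤗 all-MiniLM-L6-v2" shortens to "all-6-v2")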
    # Extract parameters and responses into flat rows
    data = []
    start_time = time.time()
    for result in results:
        params = result["Parameters"]
        response = result["Response"]
        progress = result["Progress"]
        # Elapsed time since the CSV export started (not per-query latency)
        current_time = time.time()
        elapsed_time = current_time - start_time
        # Parse the individual parameter values back out of the summary string
        temp = float(re.search(r"Temp: ([\d.]+)", params).group(1))
        top_p = float(re.search(r"Top-p: ([\d.]+)", params).group(1))
        top_k = int(re.search(r"Top-k: (\d+)", params).group(1))
        bm25 = float(re.search(r"BM25: ([\d.]+)", params).group(1))
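        # These patterns mirror the exact "Parameters" string written by
        # process_batch_query; .group(1) would raise AttributeError on any other format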
        # Extract response components
        model_info = re.search(r"Model: (.*?)\n", response)
        model = model_info.group(1) if model_info else "Unknown"
        # Extract the main answer (the text between the parameter block and the token counts)
        answer_match = re.search(r"Model Parameters:.*?\n\n(.*?)\n\n---", response, re.DOTALL)
        main_answer = answer_match.group(1).strip() if answer_match else response
        # Extract token counts
        input_tokens = re.search(r"Input tokens: (\d+)", response)
        output_tokens = re.search(r"Output tokens: (\d+)", response)
        # Extract conversation history count
        conv_history = re.search(r"Conversation History: (\d+) conversation", response)
        data.append({
            "Temperature": temp,
            "Top-p": top_p,
            "Top-k": top_k,
            "BM25 Weight": bm25,
            "Model": model,
            "Main Answer": main_answer,
            "Input Tokens": input_tokens.group(1) if input_tokens else "N/A",
            "Output Tokens": output_tokens.group(1) if output_tokens else "N/A",
            "Conversation History": conv_history.group(1) if conv_history else "0",
            "Progress": progress,
            "Elapsed Time (s)": f"{elapsed_time:.2f}"
        })
    # Create a DataFrame and save it to CSV
    df = pd.DataFrame(data)
    df.to_csv(csv_path, index=False)
    return csv_path
def format_batch_result_files(results: List[Dict], job_id: str,
                              embedding_model: str = None, llm_model: str = None,
                              param_variations: Dict = None) -> Tuple[str, str]:
    """Format batch results as Markdown and return (markdown, csv_path)."""
    # Create the CSV file with a descriptive filename
    csv_path = create_csv_from_batch_results(results, job_id, embedding_model, llm_model, param_variations)
    # Format the results as Markdown, one section per parameter combination
    formatted_results = "### Batch Query Results\n\n"
    for result in results:
        formatted_results += f"#### {result['Parameters']}\n"
        formatted_results += f"**Progress:** {result['Progress']}\n\n"
        formatted_results += f"{result['Response']}\n\n"
        formatted_results += "---\n\n"
    return formatted_results, csv_path
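# Illustrative usage (hypothetical labels; `results` as produced by process_batch_query):
#
#   markdown, csv_file = format_batch_result_files(
#       results, "job-001",
#       embedding_model="🤗 all-MiniLM-L6-v2",
#       llm_model="🤖 Mistral-7B",
#       param_variations={"temperature": "Whole range 3 values"},
#   )
#   # markdown -> "### Batch Query Results\n\n#### Temp: ..."; csv_file -> absolute CSV path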