'
- # Display full contexts if requested
- if show_full and "full_contexts" in example and example["full_contexts"]:
- for context_item in example["full_contexts"]:
- content = context_item.get('content', '')
- abbreviated = context_item.get('abbreviatedContent', None)
-
- # Process the content
- processed = ContextProcessor.process_content(content, abbreviated)
-
- html_output += f'
{processed}
'
+ # Display full contexts or highlighted contexts based on toggle
+ if show_full:
+ # Show full context - directly use the strings from the list in full_contexts
+ if "full_contexts" in example and example["full_contexts"]:
+ for context_item in example["full_contexts"]:
+ if isinstance(context_item, dict) and 'content' in context_item:
+ content = context_item.get('content', '')
+ elif isinstance(context_item, str):
+ content = context_item
+ else:
+ content = str(context_item)
+
+ # Escape HTML entities for safe display
+ escaped_content = html.escape(content)
+
+ # Create the context item box - no headers
+ html_output += f'
{escaped_content}
'
else:
- # Display regular contexts if available
+ # Show highlighted contexts
if "contexts" in example and example["contexts"]:
for context_item in example["contexts"]:
- content = context_item.get('content', '')
- abbreviated = context_item.get('abbreviatedContent', None)
-
- # Process the content
- processed = ContextProcessor.process_content(content, abbreviated)
-
- is_primary = context_item.get('is_primary', False)
- extra_class = " primary-context" if is_primary else ""
-
- html_output += f''
-
- # Or process JSON-structured highlighted contexts
- elif "contexts_highlighted" in example and example["contexts_highlighted"]:
- processed_contexts = ContextProcessor.process_json_contexts(example["contexts_highlighted"])
-
- for context_item in processed_contexts:
- is_primary = context_item.get('is_primary', False)
- extra_class = " primary-context" if is_primary else ""
-
- html_output += f''
+ if isinstance(context_item, dict):
+ content = context_item.get('content', '')
+ is_primary = context_item.get('is_primary', False)
+
+ # Extra class for primary context styling
+ extra_class = " primary-context" if is_primary else ""
+
+ # Use content directly as it already has HTML highlighting
+ html_output += f''
+ elif isinstance(context_item, str):
+ # For direct string contexts
+ html_output += f'
{context_item}
'
else:
html_output += '
No context available. Try toggling to full context view.
'
diff --git a/utils/data_loader.py b/utils/data_loader.py
index 57f9d195c4c56895451282c694dc6ffb9557aa4e..c8fcc279f5efa5c482bdb4ba47514c0d54498c30 100644
--- a/utils/data_loader.py
+++ b/utils/data_loader.py
@@ -3,7 +3,6 @@ import json
import pandas as pd
import random
import re
-from .context_processor import process_highlights
# Global data store - loaded once at import time
_ARENA_DATA = None
@@ -40,10 +39,11 @@ def create_dummy_example():
return {
"question": "Could not load questions from the dataset. Please check the data file.",
"processed_context_desc": "Error: Data not available",
- "contexts": ["No context available"],
- "full_context": "Error loading context data.",
+ "contexts": [],
+ "full_contexts": [],
"Answerable": False,
- "insufficient": True
+ "insufficient": True,
+ "insufficient_reason": "Data loading error"
}
def get_random_example():
@@ -64,102 +64,113 @@ def get_random_example():
# Process the example data
processed_example = {
"question": example['question'],
- "processed_context_desc": example.get('processed_context_desc', ''),
- "Answerable": example.get('Answerable', True), # Default to True unless specified otherwise
+ "Answerable": not example.get('insufficient', False),
"insufficient": example.get('insufficient', False),
- "insufficient_reason": example.get('insufficient_reason', '')
+ "insufficient_reason": example.get('insufficient_reason', ''),
+ "sample_id": example.get('sample_id', 0)
}
- # Process contexts - for full context
+ # Process the context description - replace a missing (NaN) value with an empty string
+ context_desc = example.get('processed_context_desc', '')
+ if pd.isna(context_desc):
+ context_desc = ""
+ # Add the description to the processed example
+ processed_example["processed_context_desc"] = context_desc
+
+ # Process full contexts - from the 'contexts' column
+ full_contexts = []
try:
- contexts_raw = example['contexts']
- if isinstance(contexts_raw, str):
- contexts = json.loads(contexts_raw)
- # Store full contexts as individual items
- full_contexts = []
- if isinstance(contexts, list):
- for i, chunk in enumerate(contexts):
- if isinstance(chunk, dict) and 'content' in chunk:
- full_contexts.append({
- 'chunk_num': i + 1,
- 'content': chunk.get('content', '')
- })
- processed_example["full_contexts"] = full_contexts
- else:
- processed_example["full_contexts"] = []
+ if 'contexts' in example and example['contexts']:
+ # If contexts is a string, try to parse it as a list (Python literal first, then JSON)
+ contexts_str = example['contexts']
+
+ if isinstance(contexts_str, str):
+ # Try to parse as list literal first (for Python list representation)
+ if contexts_str.strip().startswith('[') and contexts_str.strip().endswith(']'):
+ try:
+ # This is for handling Python list literals like "['string1', 'string2']"
+ import ast
+ contexts_list = ast.literal_eval(contexts_str)
+
+ # Process each context string in the list
+ for ctx in contexts_list:
+ full_contexts.append(ctx)
+ except (SyntaxError, ValueError) as e:
+ # If ast.literal_eval fails, try JSON
+ try:
+ contexts_list = json.loads(contexts_str)
+
+ # Process each context in the list
+ for ctx in contexts_list:
+ if isinstance(ctx, str):
+ full_contexts.append(ctx)
+ elif isinstance(ctx, dict) and 'content' in ctx:
+ full_contexts.append(ctx.get('content', ''))
+ except json.JSONDecodeError:
+ # Not valid JSON, treat as a single context
+ full_contexts.append(contexts_str)
+ else:
+ # Single context string (not JSON array or list literal)
+ full_contexts.append(contexts_str)
+ elif isinstance(contexts_str, list):
+ # Already a list, process directly
+ for ctx in contexts_str:
+ if isinstance(ctx, str):
+ full_contexts.append(ctx)
+ elif isinstance(ctx, dict) and 'content' in ctx:
+ full_contexts.append(ctx.get('content', ''))
except Exception as e:
- print(f"Error processing contexts: {e}")
- processed_example["full_contexts"] = []
+ print(f"Error processing full contexts: {e}")
- # Process highlighted contexts for display
+ # Process highlighted contexts - from contexts_highlighted column
contexts_highlighted = []
-
try:
- # Check if contexts_highlighted exists
+ # Process contexts_highlighted - this is stored as a string in CSV
if 'contexts_highlighted' in example and example['contexts_highlighted']:
- highlighted_contexts = []
+ highlights_str = example['contexts_highlighted']
- if isinstance(example['contexts_highlighted'], str):
+ if isinstance(highlights_str, str):
try:
- # Try direct JSON parsing first
- raw_str = example['contexts_highlighted']
-
- # First, manually parse the highlighted contexts using regex
- # This is a more robust approach for our specific format
- type_pattern = r'"type":\s*"(primary|secondary)"'
- content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'
-
- types = re.findall(type_pattern, raw_str)
- # Handle both regular quotes and escaped quotes in content
- raw_contents = re.findall(content_pattern, raw_str)
+ # Try to parse as JSON array
+ highlights_list = json.loads(highlights_str)
- # Extract contents from tuple matches (the regex has capture groups)
- contents = []
- for match in raw_contents:
- # Get the non-empty string from the tuple
- content = next((s for s in match if s), "")
- contents.append(content)
-
- # Create the highlighted contexts from extracted data
- for i, (ctx_type, content) in enumerate(zip(types, contents)):
- highlighted_contexts.append({
- 'type': ctx_type,
- 'abbreviatedContent': content
- })
+ # Process each highlighted context
+ for i, ctx in enumerate(highlights_list):
+ if isinstance(ctx, dict):
+ ctx_type = ctx.get('type', 'secondary')
+ content = ctx.get('abbreviatedContent', '')
+
+ # The content already has HTML span tags for highlights
+ contexts_highlighted.append({
+ 'is_primary': ctx_type == 'primary',
+ 'content': content
+ })
+ except json.JSONDecodeError:
+ print(f"Error parsing contexts_highlighted JSON: {highlights_str[:100]}...")
+ elif isinstance(highlights_str, list):
+ # Already a list, process directly
+ for ctx in highlights_str:
+ if isinstance(ctx, dict):
+ ctx_type = ctx.get('type', 'secondary')
+ content = ctx.get('abbreviatedContent', '')
- except Exception as e:
- print(f"Error extracting contexts with regex: {e}")
- else:
- # Already an object, not a string
- highlighted_contexts = example['contexts_highlighted']
-
- # Process each context item
- for i, item in enumerate(highlighted_contexts):
- if isinstance(item, dict):
- ctx_type = item.get('type', 'secondary')
- content = item.get('abbreviatedContent', '')
-
- # Process highlights using the standard format
- content = process_highlights(content)
-
- contexts_highlighted.append({
- 'chunk_num': i + 1,
- 'content': content,
- 'is_primary': ctx_type == 'primary'
- })
+ contexts_highlighted.append({
+ 'is_primary': ctx_type == 'primary',
+ 'content': content
+ })
except Exception as e:
print(f"Error processing highlighted contexts: {e}")
- # If we couldn't process the highlighted contexts, fall back to the full contexts
- if not contexts_highlighted and processed_example["full_contexts"]:
- for i, ctx in enumerate(processed_example["full_contexts"]):
+ # Make sure we have the highlighted contexts populated even if there are no contexts_highlighted
+ if not contexts_highlighted and full_contexts:
+ for content in full_contexts:
contexts_highlighted.append({
- 'chunk_num': i + 1,
- 'content': ctx.get('content', ''),
- 'is_primary': False
+ 'is_primary': False,
+ 'content': content
})
processed_example["contexts"] = contexts_highlighted
+ processed_example["full_contexts"] = full_contexts
return processed_example