Spaces:

AgentsGuards
/

image_utilities_mcp

Sleeping

App Files Files Community

image_utilities_mcp / src /securty /prompt_injection.py

ItzRoBeerT

Added prompt injection tool

17de1f9 6 months ago

raw

history blame

9.47 kB

	import re
	from typing import Dict, List, Tuple

	def check_prompt_injection(message: str) -> Dict[str, any]:
	"""
	Checks if the message contains a prompt injection attempt.

	Note: This function expects English text. If the model receives a message in another
	language, it should translate it to English before calling this function.

	Args:
	message (str): The message to check (should be in English).

	Returns:
	Dict: A dictionary containing detection results with risk level and details.
	"""
	if not message or not isinstance(message, str):
	return {
	"is_injection": False,
	"risk_level": "none",
	"confidence": 0.0,
	"detected_patterns": [],
	"risk_score": 0,
	"message": "No valid input to analyze"
	}

	message_lower = message.lower().strip()

	risk_score = 0
	detected_patterns = []


	suspicious_chars = {
	'{}': 2,
	'><': 1,
	'&': 1,
	'%': 1,
	'$': 2,
	'#': 1,
	'\|': 2,
	';': 3,
	'`': 3,
	'\\': 2,
	}

	char_score = 0
	found_chars = []
	for char, weight in suspicious_chars.items():
	if char in message:
	char_score += weight * message.count(char)
	found_chars.append(char)

	if char_score > 3:
	risk_score += min(char_score, 10)
	detected_patterns.append(f"Suspicious characters: {', '.join(found_chars)}")

	# Prompt injection patterns
	injection_patterns = [
	# Direct instruction attempts
	(r'\b(ignore\|disregard\|forget\|skip)\s+(all\s+)?(previous\|prior\|above\|earlier\|past)\s+(instructions?\|prompts?\|rules?\|commands?\|directives?)', 9),
	(r'\b(system\|assistant\|ai\|bot):\s*', 6),
	(r'\b(you\s+are\s+now\|from\s+now\s+on\|starting\s+now\|new\s+instructions?)', 8),
	(r'\b(act\s+as\|pretend\s+to\s+be\|roleplay\s+as\|behave\s+like\|simulate\s+being)', 6),

	# System manipulation
	(r'\[system\]\|\[user\]\|\[assistant\]\|\[human\]', 8),
	(r'<\s(system\|user\|assistant\|human)\s>', 8),
	(r'\b(override\|bypass\|circumvent\|disable\|turn\s+off)\s+(security\|safety\|guidelines\|restrictions\|filters)', 10),
	(r'\b(jailbreak\|break\s+free\|escape\s+from\|break\s+out)', 9),

	# Data extraction attempts
	(r'\b(repeat\|echo\|print\|output\|display\|show\|reveal)\s+(your\|the)\s+(instructions?\|prompts?\|system\s+message\|guidelines\|rules)', 8),
	(r'\b(what\s+(are\s+)?your\|tell\s+me\s+your\|give\s+me\s+your)\s+(instructions?\|prompts?\|guidelines\|rules\|system\s+message)', 7),
	(r'\b(show\s+me\|reveal\|display\|expose)\s+(your\s+)?(source\|code\|prompt\|instructions?\|system)', 7),

	# Mode changes
	(r'\b(developer\|debug\|admin\|god\|root\|sudo)\s+mode', 8),
	(r'\b(unrestricted\|unlimited\|uncensored\|unfiltered)\s+(mode\|access\|version)', 8),
	(r'\b(enable\|activate\|turn\s+on)\s+(developer\|debug\|admin)\s+mode', 9),

	# Character breaking
	(r'\b(break\s+character\|step\s+out\s+of\s+character\|ignore\s+your\s+role)', 7),
	(r'\b(stop\s+being\|quit\s+being\|cease\s+being)\s+(an\s+)?(assistant\|ai\|bot)', 7),

	# Social engineering
	(r'\b(emergency\|urgent\|critical\|important):\s*(ignore\|override\|bypass)', 6),
	(r'\b(i\s+am\s+(your\s+)?(developer\|creator\|owner\|admin\|boss))', 5),
	(r'\b(this\s+is\s+a\s+(test\|drill\|simulation\|experiment))', 4),
	(r'\b(for\s+research\s+purposes\|educational\s+purposes)\s*(ignore\|bypass)', 5),

	# MCP/Tool specific
	(r'\b(call\|invoke\|execute\|use\|run)\s+(tool\|function\|mcp\|command)\s+(without\|bypass\|ignore\|skip)\s+(safety\|security\|validation\|checks?)', 9),
	(r'\b(mcp\|tool\|function)\s+(override\|bypass\|ignore\|disable)', 8),
	(r'\b(raw\|direct\|unfiltered\|unsafe)\s+(tool\|mcp\|function\|command)\s+(access\|call\|execution)', 8),
	(r'\b(disable\|turn\s+off\|deactivate\|skip)\s+(safety\|security\|validation\|filtering\|checks?)', 9),

	# Advanced manipulation
	(r'\b(modify\|change\|alter\|update)\s+(your\s+)?(behavior\|instructions\|guidelines\|rules)', 7),
	(r'\b(temporary\|temporarily)\s+(ignore\|disable\|bypass)', 6),
	(r'\b(just\s+this\s+once\|only\s+this\s+time)\s+(ignore\|bypass)', 5),
	]

	pattern_matches = []
	for pattern, weight in injection_patterns:
	matches = re.findall(pattern, message_lower, re.IGNORECASE)
	if matches:
	risk_score += weight * len(matches)
	pattern_matches.append(f"Injection pattern detected (weight: {weight})")

	detected_patterns.extend(pattern_matches)

	# Suspicious phrases
	suspicious_phrases = {
	'ignore previous instructions': 10,
	'forget everything': 8,
	'new instructions': 7,
	'system prompt': 8,
	'you are now': 6,
	'act as if': 5,
	'pretend that': 5,
	'roleplay as': 5,
	'break character': 7,
	'step out of character': 7,
	'developer mode': 8,
	'admin mode': 8,
	'god mode': 7,
	'debug mode': 7,
	'unrestricted mode': 9,
	'jailbreak': 9,
	'prompt injection': 10,
	'ignore safety': 9,
	'bypass security': 9,
	'disable filters': 8,
	'raw access': 7,
	'direct access': 6,
	'unfiltered access': 8,
	'override safety': 9,
	'emergency override': 8,
	'for research purposes ignore': 6,
	'this is a test ignore': 5,
	'temporarily ignore': 6,
	'just this once ignore': 5,
	'modify your behavior': 7,
	'change your instructions': 8,
	'update your guidelines': 7,
	'alter your rules': 7,
	}

	phrase_matches = []
	for phrase, weight in suspicious_phrases.items():
	if phrase in message_lower:
	risk_score += weight
	phrase_matches.append(f"Suspicious phrase: '{phrase}'")

	detected_patterns.extend(phrase_matches)

	# Code injection patterns
	code_patterns = [
	(r'```\s*(python\|javascript\|bash\|sh\|cmd\|powershell\|sql\|php)', 4),
	(r'\b(eval\|exec\|system\|subprocess\|os\.\|import\s+os\|require\()', 6),
	(r'<script\|javascript:\|vbscript:\|data:\|file://\|ftp://', 7),
	(r'\{\{.*\}\}', 5), # Template injection
	(r'\$\{.*\}', 5), # Variable substitution
	(r'<%.*%>', 5), # ASP/ERB style
	(r'<\?.*\?>', 5), # PHP style
	(r'\{\%.*\%\}', 5), # Jinja2/Django style
	]

	for pattern, weight in code_patterns:
	matches = re.findall(pattern, message_lower, re.IGNORECASE)
	if matches:
	risk_score += weight * len(matches)
	detected_patterns.append(f"Code injection pattern detected")

	# 5. Length and repetition analysis
	if len(message) > 2000:
	risk_score += 2
	detected_patterns.append("Unusually long message")

	# Check for repeated patterns (could indicate injection attempts)
	words = message_lower.split()
	if len(words) > 10:
	word_freq = {}
	for word in words:
	if len(word) > 3:
	word_freq[word] = word_freq.get(word, 0) + 1

	repeated_words = [(word, count) for word, count in word_freq.items() if count > 3]
	if repeated_words:
	risk_score += min(len(repeated_words) * 2, 5)
	detected_patterns.append(f"Excessive word repetition detected")

	# Unicode/encoding tricks
	suspicious_unicode = [
	'\u200b', # Zero-width space
	'\u200c', # Zero-width non-joiner
	'\u200d', # Zero-width joiner
	'\ufeff', # Byte order mark
	]

	for char in suspicious_unicode:
	if char in message:
	risk_score += 3
	detected_patterns.append("Suspicious Unicode characters detected")
	break

	# Multiple instruction attempts (layered attacks)
	instruction_keywords = ['ignore', 'forget', 'disregard', 'override', 'bypass', 'disable']
	instruction_count = sum(1 for keyword in instruction_keywords if keyword in message_lower)
	if instruction_count >= 3:
	risk_score += instruction_count * 2
	detected_patterns.append(f"Multiple instruction manipulation attempts ({instruction_count})")

	# Calculate risk level and confidence
	if risk_score >= 15:
	risk_level = "high"
	confidence = min(0.9, 0.5 + (risk_score - 15) * 0.02)
	elif risk_score >= 8:
	risk_level = "medium"
	confidence = min(0.8, 0.3 + (risk_score - 8) * 0.03)
	elif risk_score >= 3:
	risk_level = "low"
	confidence = min(0.6, 0.1 + risk_score * 0.05)
	else:
	risk_level = "none"
	confidence = 0.0

	# Determine if it's likely an injection
	is_injection = risk_score >= 8

	if is_injection:
	result_message = f"⚠️ Potential prompt injection detected (Risk: {risk_level}, Score: {risk_score})"
	else:
	result_message = f"✅ No significant prompt injection patterns detected (Score: {risk_score})"

	return {
	"is_injection": is_injection,
	"risk_level": risk_level,
	"risk_score": risk_score,
	"confidence": round(confidence, 2),
	"detected_patterns": detected_patterns,
	"message": result_message
	}