File size: 10,368 Bytes
e4932aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
#!/usr/bin/env python3
"""
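Run script for the simple integrated pipeline.

Usage examples:
python run_simple.py sample_log.json
python run_simple.py /path/to/mordor_dataset/credential_access_log.json
python run_simple.py sample_log.json "Focus on lateral movement techniques"
python run_simple.py sample_log.json --query "Focus on lateral movement techniques" --model groq:gpt-oss-120b --temp 0.2 --output-dir custom_output

A bare argument after the log file is accepted as the query for backward
compatibility; --query, --model, --temp and --output-dir are the explicit form.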
"""
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import login as huggingface_login
# Make the project root importable so that the `src.*` packages resolve.
# This script lives in src/scripts/, i.e. three levels below the project root.
# resolve() makes the path absolute before walking upwards: with a relative
# __file__ (e.g. the script launched from its own directory on interpreters
# that do not absolutize __main__.__file__) the .parent hops would collapse
# to "." and point at the wrong directory.
project_root = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(project_root))

# Import the simple pipeline from src/full_pipeline/.
# A failed import is fatal: print where we are looking from and exit with 1.
try:
    from src.full_pipeline.simple_pipeline import analyze_log_file
except ImportError as e:
    print(f"Import error: {e}")
    print("Make sure simple_pipeline.py is in src/full_pipeline/ directory")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Script location: {Path(__file__).parent}")
    sys.exit(1)
def _required_env_var_for(model_name: str) -> str:
    """Return the name of the API-key variable required for *model_name*.

    Matching is by substring and checked in this order: Google, Groq, OpenAI.
    Groq is tested before OpenAI on purpose: "gpt-oss" names also contain
    "gpt-" and must not fall through to the OpenAI branch. Unrecognised
    names print a warning and fall back to ``GOOGLE_API_KEY``.
    """
    if "google_genai" in model_name or "gemini" in model_name:
        return "GOOGLE_API_KEY"
    if "groq" in model_name or "gpt-oss" in model_name or "llama" in model_name:
        return "GROQ_API_KEY"
    if "openai" in model_name or "gpt-" in model_name:
        return "OPENAI_API_KEY"
    print(
        f"[WARNING] Unknown model '{model_name}', using default environment checks"
    )
    return "GOOGLE_API_KEY"


def setup_environment(model_name: str = "google_genai:gemini-2.0-flash"):
    """
    Setup environment variables and check requirements.

    Loads the .env file via ``load_dotenv()``, logs in to Hugging Face when
    ``HF_TOKEN`` is set, and verifies that the API key needed by the selected
    model is present.

    Args:
        model_name: Name of the model to validate environment for

    Raises:
        SystemExit: with code 1 when the required API key is not set.
    """
    load_dotenv()

    # API keys are read straight from the process environment below; copying
    # os.getenv(KEY) back into os.environ[KEY] would not change anything.
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        huggingface_login(token=hf_token)

    required_env_var = _required_env_var_for(model_name)

    if not os.getenv(required_env_var):
        print(f"Error: {required_env_var} not found in environment variables")
        print(f"Required for model: {model_name}")
        print("Please set it in your .env file or environment.")
        print("\nAvailable models and their requirements:")
        print(" ✓ google_genai:gemini-2.0-flash: requires GOOGLE_API_KEY")
        print(" ✓ google_genai:gemini-1.5-flash: requires GOOGLE_API_KEY")
        print(" ✓ groq:gpt-oss-120b: requires GROQ_API_KEY")
        print(" ✓ groq:gpt-oss-20b: requires GROQ_API_KEY")
        print(" ✓ groq:llama-3.1-8b-instant: requires GROQ_API_KEY")
        print(" ✓ groq:llama-3.3-70b-versatile: requires GROQ_API_KEY")
        sys.exit(1)

    print(f"Environment setup complete. Using {required_env_var} for {model_name}")
def _print_log_file_suggestions():
    """Print up to three JSON files from each mordor_dataset directory near the project root."""
    # Labels are what gets printed; paths are anchored at project_root so the
    # lookup does not depend on (or change) the current working directory.
    candidates = (
        ("./mordor_dataset/", project_root / "mordor_dataset"),
        ("../mordor_dataset/", project_root / ".." / "mordor_dataset"),
    )
    existing = [(label, path) for label, path in candidates if path.exists()]
    if not existing:
        return

    print("Try looking in these directories:")
    for label, path in existing:
        json_files = sorted(path.glob("*.json"))
        if not json_files:
            continue
        print(f" {label}")
        for json_file in json_files[:3]:  # Show first 3 files
            print(f" - {json_file.name}")
        if len(json_files) > 3:
            print(f" ... and {len(json_files) - 3} more files")


def validate_inputs(log_file: str):
    """Validate the log-file argument, exiting the process when it is unusable.

    Args:
        log_file: Path to the log file given on the command line.

    Raises:
        SystemExit: with code 1 when the path does not exist, or when the
            file lacks a ``.json`` extension and the user does not confirm
            with "y" (a closed/empty stdin counts as "no").
    """
    if not os.path.exists(log_file):
        print(f"Error: Log file not found: {log_file}")
        _print_log_file_suggestions()
        sys.exit(1)

    # Extension check is case-insensitive so "LOG.JSON" is not questioned.
    if not log_file.lower().endswith(".json"):
        print(f"Warning: File doesn't have .json extension: {log_file}")
        try:
            response = input("Continue anyway? (y/n): ")
        except EOFError:
            # No stdin to answer the prompt (non-interactive run): decline.
            response = "n"
        if response.strip().lower() != "y":
            sys.exit(1)
def _print_usage():
    """Print the command-line help, then any JSON log files found near the project root."""
    usage_lines = (
        "Cybersecurity Log Analysis Pipeline",
        "=" * 50,
        "Usage: python run_simple_pipeline.py <log_file> [options]",
        "",
        "Arguments:",
        " log_file Path to the log file to analyze",
        "",
        "Options:",
        ' --query "TEXT" Optional query for additional context',
        " --model MODEL_NAME Model to use for analysis (default: google_genai:gemini-2.0-flash)",
        " --temp TEMPERATURE Temperature for model generation (default: 0.1)",
        " --output-dir DIR Output directory for results (default: mordor_dataset/eval_output)",
        "",
        "Examples:",
        " python run_simple_pipeline.py sample_log.json",
        " python run_simple_pipeline.py mordor_dataset/datasets/credential_access.json",
        " python run_simple_pipeline.py sample.json --query 'Focus on privilege escalation'",
        " python run_simple_pipeline.py sample.json --model gpt-oss-120b",
        " python run_simple_pipeline.py sample.json --model llama-3.1-8b-instant --temp 0.2",
        " python run_simple_pipeline.py sample.json --output-dir custom_output",
        "",
        "Available models:",
        " - google_genai:gemini-2.0-flash",
        " - google_genai:gemini-1.5-flash",
        " - groq:gpt-oss-120b",
        " - groq:gpt-oss-20b",
        " - groq:llama-3.1-8b-instant",
        " - groq:llama-3.3-70b-versatile",
        "",
    )
    for line in usage_lines:
        print(line)

    # Try to find sample files from project root. The chdir makes the printed
    # paths relative to the root; the caller exits right after this returns.
    os.chdir(project_root)
    sample_files = []
    for pattern in ("*.json", "mordor_dataset/*.json", "../mordor_dataset/*.json"):
        sample_files.extend(Path(".").glob(pattern))
    if sample_files:
        print("Available log files found:")
        for sample in sample_files[:5]:
            print(f" {sample}")
        if len(sample_files) > 5:
            print(f" ... and {len(sample_files) - 5} more files")


def _parse_cli_options(args):
    """Parse the arguments that follow the log file into an options dict.

    Recognised flags (each takes one value): --query, --model, --temp,
    --output-dir. Any other argument is taken as the query if none is set
    yet (legacy positional form) and ignored otherwise. Exits with code 1
    when a flag has no value or --temp is not a number.
    """
    options = {
        "query": None,
        "model_name": "google_genai:gemini-2.0-flash",
        "temperature": 0.1,
        "output_dir": "mordor_dataset/eval_output",
    }
    flag_targets = {
        "--query": "query",
        "--model": "model_name",
        "--temp": "temperature",
        "--output-dir": "output_dir",
    }

    remaining = iter(args)
    for arg in remaining:
        target = flag_targets.get(arg)
        if target is None:
            # Backward compatibility: treat as query if no flag
            if not options["query"]:
                options["query"] = arg
            continue

        value = next(remaining, None)
        if value is None:
            # A trailing flag used to be silently swallowed as the query text.
            print(f"Error: Option {arg} requires a value")
            sys.exit(1)

        if target == "temperature":
            try:
                value = float(value)
            except ValueError:
                print(f"Error: Invalid temperature value: {value}")
                sys.exit(1)
        options[target] = value

    return options


def _tactic_from_path(log_file):
    """Return the log file's parent directory name as the tactic, or None.

    None is returned when the file sits directly in "mordor_dataset" or has
    no named parent directory (e.g. a bare "sample.json", whose parent name
    is the empty string).
    """
    parent_name = Path(log_file).parent.name
    if parent_name and parent_name != "mordor_dataset":
        return parent_name
    return None


def _report_failure(error, log_file):
    """Print the pipeline error, basic debugging facts, and a hint for known causes."""
    print(f"\nPipeline failed with error: {error}")
    # Provide helpful debugging info
    print("\nDebugging information:")
    print(f" - Working directory: {os.getcwd()}")
    print(f" - Log file exists: {os.path.exists(log_file)}")
    print(f" - Python path: {sys.path[0]}")

    # Check for common issues
    message = str(error).lower()
    if "knowledge base" in message:
        print("\nPossible solution:")
        print(
            " Make sure ./cyber_knowledge_base directory exists and is properly initialized"
        )
    elif "import" in message:
        print("\nPossible solution:")
        print(
            " Make sure you're running from the correct directory with access to src/"
        )


def main():
    """Main entry point.

    Reads ``sys.argv``: the first argument is the log file, the rest are
    options (see ``--query``, ``--model``, ``--temp``, ``--output-dir``).
    Prints the usage text and exits with 1 when no log file is given.
    Otherwise validates the environment and the input, runs
    ``analyze_log_file`` and prints the ``"markdown_report"`` entry of its
    result. Exits with 0 on Ctrl-C and with 1 on any other pipeline error.
    """
    if len(sys.argv) < 2:
        _print_usage()
        sys.exit(1)

    log_file = sys.argv[1]
    options = _parse_cli_options(sys.argv[2:])
    query = options["query"]
    model_name = options["model_name"]
    temperature = options["temperature"]
    output_dir = options["output_dir"]

    print("Cybersecurity Multi-Agent Pipeline")
    print("=" * 50)
    print(f"Log file: {log_file}")
    print(f"Model: {model_name}")
    print(f"Temperature: {temperature}")
    print(f"Output directory: {output_dir}")
    print(f"User query: {query or 'None'}")
    print("")

    # Setup and validation (both exit the process on failure)
    setup_environment(model_name)
    validate_inputs(log_file)

    # Run the pipeline
    try:
        print("Initializing pipeline...")
        tactic = _tactic_from_path(log_file)

        # Each agent writes into its own subdirectory of the output directory
        analysis_dir = os.path.join(output_dir, "analysis")
        final_response_dir = os.path.join(output_dir, "final_response")
        os.makedirs(analysis_dir, exist_ok=True)
        os.makedirs(final_response_dir, exist_ok=True)

        final_state = analyze_log_file(
            log_file,
            query,
            tactic,
            model_name=model_name,
            temperature=temperature,
            log_agent_output_dir=analysis_dir,
            response_agent_output_dir=final_response_dir,
        )
        print(final_state["markdown_report"])
        print("\nPipeline execution completed successfully!")
    except KeyboardInterrupt:
        print("\nPipeline interrupted by user.")
        sys.exit(0)
    except Exception as e:
        # Top-level boundary: report every failure with context, then exit 1
        _report_failure(e, log_file)
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()