only accept one rule as a solution, we select the first one. Do not allow groundings
Files changed:
- .gitignore (+1, -0)
- README.md (+7, -4)
- VerifiableRewardsForScalableLogicalReasoning.py (+30, -1)
.gitignore (added)

```diff
@@ -0,0 +1 @@
+.idea
```
README.md (changed)

```diff
@@ -18,16 +18,19 @@ description: >-
 
 # Metric Card for Symbolic Judge: Verifiable Rewards for Scalable Logical Reasoning
 
-
-
-
-
+This metric is part of the SLR framework (AIML-TUDA/SLR-Bench) and provides rewards for logical reasoning tasks.
+The reward model is grounded in the ILP (Inductive Logic Programming) paradigm, testing whether a given hypothesis (logic rule) solves a logical reasoning task.
+To check for entailment, the logic rule is executed against a set of background knowledge and examples, ensuring automatic evaluation that is verifiable, transparent, and reproducible.
+
+
 ### How it Works
 - **Input:** The symbolic judge takes as input a candidate hypothesis (logic rule) and an executable validation program containing background knowledge and examples.
 - **Execution:** The candidate rule is executed against the validation program using a Prolog interpreter.
 - **Correctness Criteria:** The rule is considered correct if it entails all positive examples and rejects all negative examples.
 - **Metrics:** The symbolic judge computes a range of evaluation metrics (detailed below).
+
 **Note:** A local Prolog interpreter is required to execute validation programs.
+
 ---
 
 ### Inputs
```
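The "How it Works" text added above describes the judge's loop: take a candidate rule, run it against a validation program, and report verifiable metrics. As a purely illustrative sketch of calling the metric through the Hugging Face `evaluate` API (the repository id, the argument names, and the returned keys below are assumptions, not taken from the card, and should be checked against the Inputs/Outputs sections):

```python
# Illustrative sketch only: metric id, argument names, and result keys are
# assumptions and may differ from the actual metric implementation.
import evaluate

symbolic_judge = evaluate.load(
    "AIML-TUDA/VerifiableRewardsForScalableLogicalReasoning"  # assumed repo id
)

# Candidate hypothesis: a single Prolog rule produced by a model.
prediction = "eastbound(T) :- has_car(T, C), short(C)."

# Executable validation program: background knowledge plus labelled examples
# (the facts below are made up for illustration).
validation_program = """
has_car(train_east_1, car_a).
short(car_a).
eastbound(train_east_1).
"""

results = symbolic_judge.compute(
    predictions=[prediction],
    references=[{
        "validation_program": validation_program,
        "evaluation_config": {
            "positive_predicate": "eastbound",   # defaults visible in the source file
            "negative_predicate": "westbound",
        },
    }],
)
print(results)
```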
VerifiableRewardsForScalableLogicalReasoning.py (changed)

```diff
@@ -100,13 +100,41 @@ Returns:
     """
 
 
+def validate_rule_no_hardcoded_cars(prediction):
+    """Reject rules that hardcode specific car identifiers"""
+    import re
+
+    # Look for has_car with a constant (lowercase) in second position
+    hardcoded_pattern = r'has_car\([^,]+,\s*([a-z][a-z0-9_]*)\)'
+    matches = re.findall(hardcoded_pattern, prediction)
+
+    if matches:
+        return False, f"Rule contains ground cars: {matches[0]}"
+
+    return True, "Rule is valid"
+
+
 def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
     """
     Evaluates a predicted rule against the validation program using Prolog.
     """
+    is_valid, validation_msg = validate_rule_no_hardcoded_cars(prediction)
+    if not is_valid:
+        return {
+            "is_correct": False,
+            "partial_score": 0.0,
+            "syntax_valid": False,
+            "error": f"Rule validation failed: {validation_msg}"
+        }
+
     # Extract configuration
     positive_pred = eval_config.get("positive_predicate", "eastbound")
     negative_pred = eval_config.get("negative_predicate", "westbound")
+
+
+    validation_program = anonymize_entities(validation_program)
+
+
     # extract predicate from rule_to_evaluate
     rule_to_evaluate = extract_ilp_from_text_v2(prediction)
     if positive_pred not in rule_to_evaluate:
```
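The new `validate_rule_no_hardcoded_cars` helper rejects hypotheses whose `has_car/2` literal has a lowercase constant (a specific car) in the second argument instead of a Prolog variable, and `_evaluate_with_prolog` now short-circuits with a failed result when that check trips; the hunk also routes the validation program through `anonymize_entities`, whose definition is not part of this diff. A small standalone check of how the regex behaves (the example rules below are invented for illustration):

```python
import re

# Same pattern as in the diff: a lowercase constant in has_car/2's second slot.
hardcoded_pattern = r'has_car\([^,]+,\s*([a-z][a-z0-9_]*)\)'

# A grounded rule that names a specific car: rejected.
grounded = "eastbound(T) :- has_car(T, car_3)."
# A properly variabilized rule (Prolog variables are uppercase): accepted.
variabilized = "eastbound(T) :- has_car(T, C), short(C)."

print(re.findall(hardcoded_pattern, grounded))      # ['car_3']
print(re.findall(hardcoded_pattern, variabilized))  # []
```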
```diff
@@ -234,6 +262,7 @@ def extract_ilp_from_text_v2(text, target_predicates=None):
         if not statement.endswith('.'):
             statement += '.'
         p_code += statement + '\n'
+    print(p_code)
     return p_code.strip()  # Ensure no trailing whitespace
 
 
```
```diff
@@ -315,7 +344,7 @@ class VerifiableRewardsForScalableLogicalReasoning(evaluate.Metric):
             eval_inputs.append((prediction, validation_program, eval_config))
 
         # if more than 1k predictions, we use multiprocessing to speed up the evaluation
-        if len(eval_inputs) >
+        if len(eval_inputs) > 500:
             # Process evaluations in parallel
             num_cpus = max(1, mp.cpu_count() - 1)  # Leave one CPU free
             with mp.Pool(processes=num_cpus) as pool:
```
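The last hunk lowers the batch size at which evaluation switches from sequential execution to a multiprocessing pool. A minimal standalone sketch of that dispatch pattern, with a stand-in worker function and threshold constant that are not the module's actual code:

```python
import multiprocessing as mp

PARALLEL_THRESHOLD = 500  # mirrors the new threshold in the diff


def _evaluate_single(args):
    """Stand-in for _evaluate_with_prolog: unpack one (prediction, program, config) tuple."""
    prediction, validation_program, eval_config = args
    return {"is_correct": False, "partial_score": 0.0}  # placeholder result


def evaluate_all(eval_inputs):
    # Small batches: evaluate sequentially to avoid process start-up overhead.
    if len(eval_inputs) <= PARALLEL_THRESHOLD:
        return [_evaluate_single(item) for item in eval_inputs]

    # Large batches: fan out across CPUs, leaving one core free as in the diff.
    num_cpus = max(1, mp.cpu_count() - 1)
    with mp.Pool(processes=num_cpus) as pool:
        return pool.map(_evaluate_single, eval_inputs)
```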