Update app.py
Browse files
app.py
CHANGED
|
@@ -331,26 +331,42 @@ SYSTEM_PROMPT ="""Your task is to process OCR-extracted text from images of Scra
|
|
| 331 |
# - Correcting syntax errors such as missing commas, brackets, or quotes.
|
| 332 |
# - Ensuring the JSON structure matches the schema exactly.
|
| 333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
# Remember: Your output must be valid JSON only, ready to be parsed without errors.
|
| 335 |
# """
|
| 336 |
SYSTEM_PROMPT_JSON_CORRECTOR = """
|
| 337 |
-
You are
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
|
|
|
| 351 |
"""
|
| 352 |
|
| 353 |
-
|
| 354 |
# debugger and resolver agent for Scratch 3.0
|
| 355 |
# Main agent of the system for Scratch 3.0
|
| 356 |
agent = create_react_agent(
|
|
@@ -791,69 +807,189 @@ ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
|
|
| 791 |
|
| 792 |
# Helper function to extract JSON from LLM response
|
| 793 |
def extract_json_from_llm_response(raw_response: str) -> dict:
|
| 794 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 816 |
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
logger.debug(f"Stripped {excess} extra closing brace(s).")
|
| 841 |
-
|
| 842 |
-
# --- 8) Escape literal newlines in *all* string values ---
|
| 843 |
-
json_string = re.sub(
|
| 844 |
-
r'"((?:[^"\\]|\\.)*?)"',
|
| 845 |
-
lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
|
| 846 |
-
json_string,
|
| 847 |
-
flags=re.DOTALL
|
| 848 |
-
)
|
| 849 |
-
logger.debug("Escaped newlines in strings.")
|
| 850 |
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
|
| 858 |
# def reduce_image_size_to_limit(clean_b64_str, max_kb=4000):
|
| 859 |
# """
|
|
@@ -1440,18 +1576,31 @@ end
|
|
| 1440 |
|
| 1441 |
except json.JSONDecodeError as error_json:
|
| 1442 |
# If JSON parsing fails, use the json resolver agent
|
| 1443 |
-
correction_prompt = (
|
| 1444 |
-
|
| 1445 |
-
|
| 1446 |
-
|
| 1447 |
-
|
| 1448 |
-
|
| 1449 |
-
|
| 1450 |
-
|
| 1451 |
-
|
| 1452 |
-
|
| 1453 |
-
|
| 1454 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1455 |
try:
|
| 1456 |
correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
|
| 1457 |
corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)
|
|
|
|
| 331 |
# - Correcting syntax errors such as missing commas, brackets, or quotes.
|
| 332 |
# - Ensuring the JSON structure matches the schema exactly.
|
| 333 |
|
| 334 |
+
# Remember: Your output must be valid JSON only, ready to be parsed without errors.
|
| 335 |
+
# """
|
| 336 |
+
# SYSTEM_PROMPT_JSON_CORRECTOR = """
|
| 337 |
+
# You are an assistant that outputs JSON responses strictly following the given schema.
|
| 338 |
+
# If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
|
| 339 |
+
# Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
|
| 340 |
+
|
| 341 |
+
# If you receive an invalid or incomplete JSON response, fix it by:
|
| 342 |
+
# - Adding any missing required fields with appropriate values.
|
| 343 |
+
# - Correcting syntax errors such as missing commas, brackets, or quotes.
|
| 344 |
+
# - Ensuring the JSON structure matches the schema exactly.
|
| 345 |
+
# - Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
|
| 346 |
+
# - Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
|
| 347 |
+
# - Never output explanations, comments, or extra text — only the corrected JSON.
|
| 348 |
+
# - **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
|
| 349 |
+
|
| 350 |
# Remember: Your output must be valid JSON only, ready to be parsed without errors.
|
| 351 |
# """
|
| 352 |
SYSTEM_PROMPT_JSON_CORRECTOR = """
|
| 353 |
+
You are a JSON correction assistant. Your ONLY task is to fix malformed JSON and return it in the correct format.
|
| 354 |
+
|
| 355 |
+
REQUIRED OUTPUT FORMAT:
|
| 356 |
+
{
|
| 357 |
+
"refined_logic": {
|
| 358 |
+
"name_variable": "sprite_name_here",
|
| 359 |
+
"pseudocode": "pseudocode_string_here"
|
| 360 |
+
}
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
RULES:
|
| 364 |
+
1. Extract the sprite name and pseudocode from the input
|
| 365 |
+
2. Return ONLY valid JSON in the exact format above
|
| 366 |
+
3. No explanations, no extra text, no other fields
|
| 367 |
+
4. If you can't find the data, use "Unknown" for name_variable and "No pseudocode found" for pseudocode
|
| 368 |
"""
|
| 369 |
|
|
|
|
| 370 |
# debugger and resolver agent for Scratch 3.0
|
| 371 |
# Main agent of the system for Scratch 3.0
|
| 372 |
agent = create_react_agent(
|
|
|
|
| 807 |
|
| 808 |
# Helper function to extract JSON from LLM response
|
| 809 |
def extract_json_from_llm_response(raw_response: str) -> dict:
    """Extract the ``refined_logic`` JSON object from an LLM reply.

    The reply is processed in stages, stopping at the first one that yields
    a usable result:

    1. Take the contents of the first Markdown code fence if there is one,
       otherwise the whole reply.
    2. Slice out the first balanced ``{...}`` object.
    3. Parse the slice as-is (control characters such as literal newlines
       inside strings are tolerated).  Only if that fails, apply the regex
       repairs (trailing commas, single-quoted keys/values) and parse again.
       Parsing first guarantees valid JSON is never damaged by the repairs.
    4. If no attempt produces the expected structure, salvage the
       ``name_variable`` and ``pseudocode`` values directly from the raw
       reply with regular expressions.

    Args:
        raw_response: Text returned by the model.  ``None`` is treated as an
            empty reply and any other non-string value is converted with
            ``str()``.

    Returns:
        A dict containing ``"refined_logic"`` with at least the keys
        ``"name_variable"`` and ``"pseudocode"``.  The function never raises
        on malformed input; placeholder values ("No Code-blocks",
        "Parse Error", ...) are returned instead.
    """
    if not isinstance(raw_response, str):
        raw_response = "" if raw_response is None else str(raw_response)

    print(f"Raw LLM response: {raw_response[:200]}...")

    # Try to find JSON in code blocks first
    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
    json_string = fenced.group(1).strip() if fenced else raw_response.strip()

    # Find the first complete JSON object (there may be several, or junk around it)
    first_brace = json_string.find('{')
    if first_brace == -1:
        print("No JSON object found in response")
        return _refined_logic_payload("No Code-blocks", "No Code-blocks")

    last_brace = _find_object_end(json_string, first_brace)
    if last_brace == -1:
        print("No matching closing brace found")
        return _refined_logic_payload("Parse Error", "Malformed JSON")

    json_string = json_string[first_brace:last_brace + 1]

    # Attempt 1: the text exactly as the model wrote it.
    # Attempt 2: the same text after the regex repairs.
    failure = None
    for repaired in (False, True):
        candidate = _repair_common_json_slips(json_string) if repaired else json_string
        if repaired:
            print(f"Cleaned JSON string: {candidate[:200]}...")
        try:
            # strict=False accepts raw newlines/tabs inside string values,
            # which models frequently emit in multi-line pseudocode.
            parsed = json.loads(candidate, strict=False)
            _require_refined_logic(parsed)
        except ValueError as error:  # json.JSONDecodeError subclasses ValueError
            failure = error
            continue
        print("Successfully parsed and validated JSON")
        return parsed

    print(f"JSON parsing failed: {failure}")

    # Try to extract meaningful data even from malformed JSON using regex
    name_value = _salvage_field(raw_response, "name_variable")
    pseudocode_value = _salvage_field(raw_response, "pseudocode")
    if name_value is not None and pseudocode_value is not None:
        print("Extracted data using regex fallback")
        return _refined_logic_payload(name_value, pseudocode_value)

    # Return a default structure on parsing failure
    return _refined_logic_payload("Parse Error", "Failed to parse response")


def _refined_logic_payload(name_variable, pseudocode) -> dict:
    """Build the result structure returned by extract_json_from_llm_response."""
    return {
        "refined_logic": {
            "name_variable": name_variable,
            "pseudocode": pseudocode
        }
    }


def _find_object_end(text: str, start: int) -> int:
    """Return the index of the ``}`` closing the ``{`` at *start*, or -1.

    Braces inside double-quoted strings are ignored.  If that string-aware
    scan finds nothing (e.g. the text has unbalanced quotes), a plain brace
    count is used as a last resort.
    """
    for respect_strings in (True, False):
        depth = 0
        in_string = False
        escaped = False
        for index in range(start, len(text)):
            char = text[index]
            if in_string:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    in_string = False
                continue
            if char == '"' and respect_strings:
                in_string = True
            elif char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return index
    return -1


def _repair_common_json_slips(text: str) -> str:
    """Apply regex repairs for trailing commas and single-quoted keys/values."""
    # 1. Remove trailing commas
    text = re.sub(r',\s*}', '}', text)
    text = re.sub(r',\s*]', ']', text)
    # 2. Fix single quotes around simple keys and values (not containing quotes)
    text = re.sub(r"'([^'\"]*)'(\s*:)", r'"\1"\2', text)
    text = re.sub(r"(:\s*)'([^'\"]*)'(\s*[,}])", r'\1"\2"\3', text)
    return text


def _require_refined_logic(parsed) -> None:
    """Raise ValueError unless *parsed* has the expected refined_logic shape."""
    if not isinstance(parsed, dict):
        raise ValueError("Response is not a JSON object")
    if "refined_logic" not in parsed:
        raise ValueError("Missing 'refined_logic' key")
    refined_logic = parsed["refined_logic"]
    if not isinstance(refined_logic, dict):
        raise ValueError("'refined_logic' is not an object")
    if "name_variable" not in refined_logic or "pseudocode" not in refined_logic:
        raise ValueError("Missing required keys in 'refined_logic'")


def _salvage_field(text: str, key: str):
    """Return the quoted value that follows *key* in *text*, or None.

    The key may be wrapped in single or double quotes.  A double-quoted value
    may contain escaped characters and is decoded like a JSON string; a
    single-quoted value is returned verbatim.
    """
    pattern = (
        r"""["']""" + re.escape(key) + r"""["']\s*:\s*(?:"""
        r'"((?:[^"\\]|\\.)*)"'      # group 1: double-quoted value, escapes allowed
        r"|'((?:[^'\\]|\\.)*)'"     # group 2: single-quoted value
        r")"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match is None:
        return None
    if match.group(1) is None:
        return match.group(2)
    body = match.group(1)
    try:
        # Decode escapes (\n, \", \uXXXX) so the value matches what a normal
        # JSON parse would have produced.
        return json.loads('"' + body + '"', strict=False)
    except ValueError:
        return body
|
| 927 |
+
|
| 928 |
|
| 929 |
+
# def extract_json_from_llm_response(raw_response: str) -> dict:
|
| 930 |
+
# # --- 1) Pull out the JSON code‑block if present ---
|
| 931 |
+
# md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
|
| 932 |
+
# json_string = md.group(1).strip() if md else raw_response
|
| 933 |
+
|
| 934 |
+
# # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
|
| 935 |
+
# first, last = json_string.find('{'), json_string.rfind('}')
|
| 936 |
+
# if 0 <= first < last:
|
| 937 |
+
# json_string = json_string[first:last+1]
|
| 938 |
+
|
| 939 |
+
# # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
|
| 940 |
+
# json_string = re.sub(r'\b\w+\s*{', '{', json_string)
|
| 941 |
+
# json_string = re.sub(r'"assistant"\s*:', '', json_string)
|
| 942 |
+
# json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
|
| 943 |
+
# logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
|
| 944 |
+
|
| 945 |
+
# # --- 3.1) Fix stray inner quotes at start of name/list values ---
|
| 946 |
+
# # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
|
| 947 |
+
# json_string = re.sub(
|
| 948 |
+
# r'("name"\s*:\s*")\s*"',
|
| 949 |
+
# r'\1',
|
| 950 |
+
# json_string
|
| 951 |
+
# )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 952 |
|
| 953 |
+
# # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
|
| 954 |
+
# def _esc(m):
|
| 955 |
+
# prefix, body = m.group(1), m.group(2)
|
| 956 |
+
# return prefix + body.replace('"', r'\"')
|
| 957 |
+
# json_string = re.sub(
|
| 958 |
+
# r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
|
| 959 |
+
# _esc,
|
| 960 |
+
# json_string
|
| 961 |
+
# )
|
| 962 |
+
# logger.debug("Escaped embedded quotes in logic fields.")
|
| 963 |
+
|
| 964 |
+
# logger.debug("Quoted unquoted keys.")
|
| 965 |
+
|
| 966 |
+
# # --- 6) Remove trailing commas before } or ] ---
|
| 967 |
+
# json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
|
| 968 |
+
# json_string = re.sub(r',\s*,', ',', json_string)
|
| 969 |
+
# logger.debug("Removed trailing commas.")
|
| 970 |
+
|
| 971 |
+
# # --- 7) Balance braces: drop extra } at end if needed ---
|
| 972 |
+
# ob, cb = json_string.count('{'), json_string.count('}')
|
| 973 |
+
# if cb > ob:
|
| 974 |
+
# excess = cb - ob
|
| 975 |
+
# json_string = json_string.rstrip()[:-excess]
|
| 976 |
+
# logger.debug(f"Stripped {excess} extra closing brace(s).")
|
| 977 |
+
|
| 978 |
+
# # --- 8) Escape literal newlines in *all* string values ---
|
| 979 |
+
# json_string = re.sub(
|
| 980 |
+
# r'"((?:[^"\\]|\\.)*?)"',
|
| 981 |
+
# lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
|
| 982 |
+
# json_string,
|
| 983 |
+
# flags=re.DOTALL
|
| 984 |
+
# )
|
| 985 |
+
# logger.debug("Escaped newlines in strings.")
|
| 986 |
+
|
| 987 |
+
# # --- 9) Final parse attempt ---
|
| 988 |
+
# try:
|
| 989 |
+
# return json.loads(json_string)
|
| 990 |
+
# except json.JSONDecodeError:
|
| 991 |
+
# logger.error("Sanitized JSON still invalid:\n%s", json_string)
|
| 992 |
+
# raise
|
| 993 |
|
| 994 |
# def reduce_image_size_to_limit(clean_b64_str, max_kb=4000):
|
| 995 |
# """
|
|
|
|
| 1576 |
|
| 1577 |
except json.JSONDecodeError as error_json:
|
| 1578 |
# If JSON parsing fails, use the json resolver agent
|
| 1579 |
+
# correction_prompt = (
|
| 1580 |
+
# "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
|
| 1581 |
+
# "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
|
| 1582 |
+
# f"- **Error Details**: {error_json}\n\n"
|
| 1583 |
+
# "**Strict Instructions for your response:**\n"
|
| 1584 |
+
# "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
|
| 1585 |
+
# "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
|
| 1586 |
+
# "3. No trailing commas. Correct nesting.\n\n"
|
| 1587 |
+
# "Here is the problematic JSON string to correct:\n"
|
| 1588 |
+
# f"```json\n{llm_output_raw}\n```\n"
|
| 1589 |
+
# "Corrected JSON:\n"
|
| 1590 |
+
# )
|
| 1591 |
+
correction_prompt = f"""
|
| 1592 |
+
Fix this malformed response and return only the corrected JSON:
|
| 1593 |
+
|
| 1594 |
+
Input: {llm_output_raw if 'llm_output_raw' in locals() else 'No response available'}
|
| 1595 |
+
|
| 1596 |
+
Extract the sprite name and pseudocode, then return in this exact format:
|
| 1597 |
+
{{
|
| 1598 |
+
"refined_logic": {{
|
| 1599 |
+
"name_variable": "sprite_name",
|
| 1600 |
+
"pseudocode": "pseudocode_here"
|
| 1601 |
+
}}
|
| 1602 |
+
}}
|
| 1603 |
+
"""
|
| 1604 |
try:
|
| 1605 |
correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
|
| 1606 |
corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)
|