Scratch_Vision_Game_test_dup

Sleeping

App Files Files Community

prthm11 committed on Sep 18

Commit

5e4aebd

verified ·

1 Parent(s): 78918c4

Update app.py

Browse files

Files changed (1) hide show

app.py +236 -87

app.py CHANGED Viewed

@@ -331,26 +331,42 @@ SYSTEM_PROMPT ="""Your task is to process OCR-extracted text from images of Scra
 # - Correcting syntax errors such as missing commas, brackets, or quotes.
 # - Ensuring the JSON structure matches the schema exactly.
 # Remember: Your output must be valid JSON only, ready to be parsed without errors.
 # """
 SYSTEM_PROMPT_JSON_CORRECTOR = """
-You are an assistant that outputs JSON responses strictly following the given schema.
-If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
-Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
-If you receive an invalid or incomplete JSON response, fix it by:
-- Adding any missing required fields with appropriate values.
-- Correcting syntax errors such as missing commas, brackets, or quotes.
-- Ensuring the JSON structure matches the schema exactly.
-- Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
-- Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
-- Never output explanations, comments, or extra text — only the corrected JSON.
-- **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
-Remember: Your output must be valid JSON only, ready to be parsed without errors.
 """
 # debugger and resolver agent for Scratch 3.0
 # Main agent of the system agent for Scratch 3.0
 agent = create_react_agent(
@@ -791,69 +807,189 @@ ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
 # Helper function to extract JSON from LLM response
 def extract_json_from_llm_response(raw_response: str) -> dict:
-    # --- 1) Pull out the JSON code‑block if present ---
     md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
-    json_string = md.group(1).strip() if md else raw_response
-    # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
-    first, last = json_string.find('{'), json_string.rfind('}')
-    if 0 <= first < last:
-        json_string = json_string[first:last+1]
-    # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
-    json_string = re.sub(r'\b\w+\s*{', '{', json_string)
-    json_string = re.sub(r'"assistant"\s*:', '', json_string)
-    json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
-    logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
-    # --- 3.1) Fix stray inner quotes at start of name/list values ---
-    # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
-    json_string = re.sub(
-        r'("name"\s*:\s*")\s*"',
-        r'\1',
-        json_string
-    )
-    # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
-    def _esc(m):
-        prefix, body = m.group(1), m.group(2)
-        return prefix + body.replace('"', r'\"')
-    json_string = re.sub(
-        r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
-        _esc,
-        json_string
-    )
-    logger.debug("Escaped embedded quotes in logic fields.")
-    logger.debug("Quoted unquoted keys.")
-    # --- 6) Remove trailing commas before } or ] ---
-    json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
-    json_string = re.sub(r',\s*,', ',', json_string)
-    logger.debug("Removed trailing commas.")
-    # --- 7) Balance braces: drop extra } at end if needed ---
-    ob, cb = json_string.count('{'), json_string.count('}')
-    if cb > ob:
-        excess = cb - ob
-        json_string = json_string.rstrip()[:-excess]
-        logger.debug(f"Stripped {excess} extra closing brace(s).")
-    # --- 8) Escape literal newlines in *all* string values ---
-    json_string = re.sub(
-        r'"((?:[^"\\]|\\.)*?)"',
-        lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
-        json_string,
-        flags=re.DOTALL
-    )
-    logger.debug("Escaped newlines in strings.")
-    # --- 9) Final parse attempt ---
-    try:
-        return json.loads(json_string)
-    except json.JSONDecodeError:
-        logger.error("Sanitized JSON still invalid:\n%s", json_string)
-        raise
 # def reduce_image_size_to_limit(clean_b64_str, max_kb=4000):
 #     """
@@ -1440,18 +1576,31 @@ end
     except json.JSONDecodeError as error_json:
         # If JSON parsing fails, use the json resolver agent
-        correction_prompt = (
-            "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
-            "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
-            f"- **Error Details**: {error_json}\n\n"
-            "**Strict Instructions for your response:**\n"
-            "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
-            "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
-            "3. No trailing commas. Correct nesting.\n\n"
-            "Here is the problematic JSON string to correct:\n"
-            f"```json\n{llm_output_raw}\n```\n"
-            "Corrected JSON:\n"
-        )
         try:
             correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
             corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)

 # - Correcting syntax errors such as missing commas, brackets, or quotes.
 # - Ensuring the JSON structure matches the schema exactly.
+# Remember: Your output must be valid JSON only, ready to be parsed without errors.
+# """
+# SYSTEM_PROMPT_JSON_CORRECTOR = """
+# You are an assistant that outputs JSON responses strictly following the given schema.
+# If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
+# Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
+# If you receive an invalid or incomplete JSON response, fix it by:
+# - Adding any missing required fields with appropriate values.
+# - Correcting syntax errors such as missing commas, brackets, or quotes.
+# - Ensuring the JSON structure matches the schema exactly.
+# - Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
+# - Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
+# - Never output explanations, comments, or extra text — only the corrected JSON.
+# - **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
 # Remember: Your output must be valid JSON only, ready to be parsed without errors.
 # """
 # Prompt text for a JSON-repair model: it asks for malformed output to be
 # rewritten as {"refined_logic": {"name_variable": ..., "pseudocode": ...}}
 # with no extra fields, and names the placeholder values ("Unknown" /
 # "No pseudocode found") to use when the data cannot be located.
 # NOTE(review): presumably the system prompt of the JSON resolver agent; the
 # place where it is consumed is not visible in this hunk, so confirm there.
 SYSTEM_PROMPT_JSON_CORRECTOR = """
+You are a JSON correction assistant. Your ONLY task is to fix malformed JSON and return it in the correct format.
+REQUIRED OUTPUT FORMAT:
+{
+  "refined_logic": {
+    "name_variable": "sprite_name_here",
+    "pseudocode": "pseudocode_string_here"
+  }
+}
+RULES:
+1. Extract the sprite name and pseudocode from the input
+2. Return ONLY valid JSON in the exact format above
+3. No explanations, no extra text, no other fields
+4. If you can't find the data, use "Unknown" for name_variable and "No pseudocode found" for pseudocode
 """
 # debugger and resolver agent for Scratch 3.0
 # Main agent of the system agent for Scratch 3.0
 agent = create_react_agent(
 # Helper function to extract JSON from LLM response
 def extract_json_from_llm_response(raw_response: str) -> dict:
+    """
+    Improved JSON extraction with better error handling and validation
+    """
+    print(f"Raw LLM response: {raw_response[:200]}...")
+    # Try to find JSON in code blocks first
     md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
+    if md:
+        json_string = md.group(1).strip()
+    else:
+        json_string = raw_response.strip()
+    # Find the first complete JSON object (handle cases with multiple objects/arrays)
+    first_brace = json_string.find('{')
+    if first_brace == -1:
+        print("No JSON object found in response")
+        return {
+            "refined_logic": {
+                "name_variable": "No Code-blocks",
+                "pseudocode": "No Code-blocks"
+            }
+        }
+    # Find the matching closing brace for the first opening brace
+    brace_count = 0
+    last_brace = -1
+    for i, char in enumerate(json_string[first_brace:], first_brace):
+        if char == '{':
+            brace_count += 1
+        elif char == '}':
+            brace_count -= 1
+            if brace_count == 0:
+                last_brace = i
+                break
+    if last_brace == -1:
+        print("No matching closing brace found")
+        return {
+            "refined_logic": {
+                "name_variable": "Parse Error",
+                "pseudocode": "Malformed JSON"
+            }
+        }
+    json_string = json_string[first_brace:last_brace+1]
+    # Simple cleanup - just handle the most common issues
+    # 1. Remove trailing commas
+    json_string = re.sub(r',\s*}', '}', json_string)
+    json_string = re.sub(r',\s*]', ']', json_string)
+    # 2. Fix single quotes around simple values (not containing quotes)
+    json_string = re.sub(r"'([^'\"]*)'(\s*:)", r'"\1"\2', json_string)  # Keys
+    json_string = re.sub(r"(:\s*)'([^'\"]*)'(\s*[,}])", r'\1"\2"\3', json_string)  # Simple values
+    print(f"Cleaned JSON string: {json_string[:200]}...")
+    try:
+        parsed = json.loads(json_string)
+        # Validate the expected structure
+        if not isinstance(parsed, dict):
+            raise ValueError("Response is not a JSON object")
+        if "refined_logic" not in parsed:
+            raise ValueError("Missing 'refined_logic' key")
+        refined_logic = parsed["refined_logic"]
+        if not isinstance(refined_logic, dict):
+            raise ValueError("'refined_logic' is not an object")
+        if "name_variable" not in refined_logic or "pseudocode" not in refined_logic:
+            raise ValueError("Missing required keys in 'refined_logic'")
+        print("Successfully parsed and validated JSON")
+        return parsed
+    except (json.JSONDecodeError, ValueError) as e:
+        print(f"JSON parsing failed: {e}")
+        # Try to extract meaningful data even from malformed JSON using regex
+        try:
+            # Look for name_variable and pseudocode patterns with more flexible matching
+            name_match = re.search(r'"name_variable":\s*["\']([^"\']*)["\']', raw_response)
+            pseudo_match = re.search(r'"pseudocode":\s*["\']([^"\']*)["\']', raw_response)
+            if name_match and pseudo_match:
+                print("Extracted data using regex fallback")
+                return {
+                    "refined_logic": {
+                        "name_variable": name_match.group(1),
+                        "pseudocode": pseudo_match.group(1)
+                    }
+                }
+            # Try to find any valid JSON-like structure in the response
+            # Look for patterns like {'refined_logic': 'pseudocode', 'block_relationships': [...]}
+            alt_match = re.search(r"'name_variable':\s*'([^']*)'.*?'pseudocode':\s*'([^']*)'", raw_response, re.DOTALL)
+            if alt_match:
+                print("Extracted data using alternative pattern")
+                return {
+                    "refined_logic": {
+                        "name_variable": alt_match.group(1),
+                        "pseudocode": alt_match.group(2)
+                    }
+                }
+        except Exception as regex_error:
+            print(f"Regex extraction also failed: {regex_error}")
+        # Return a default structure on parsing failure
+        return {
+            "refined_logic": {
+                "name_variable": "Parse Error",
+                "pseudocode": "Failed to parse response"
+            }
+        }
+# def extract_json_from_llm_response(raw_response: str) -> dict:
+#     # --- 1) Pull out the JSON code‑block if present ---
+#     md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
+#     json_string = md.group(1).strip() if md else raw_response
+#     # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
+#     first, last = json_string.find('{'), json_string.rfind('}')
+#     if 0 <= first < last:
+#         json_string = json_string[first:last+1]
+#     # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
+#     json_string = re.sub(r'\b\w+\s*{', '{', json_string)
+#     json_string = re.sub(r'"assistant"\s*:', '', json_string)
+#     json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
+#     logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
+#     # --- 3.1) Fix stray inner quotes at start of name/list values ---
+#     # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
+#     json_string = re.sub(
+#         r'("name"\s*:\s*")\s*"',
+#         r'\1',
+#         json_string
+#     )
+#     # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
+#     def _esc(m):
+#         prefix, body = m.group(1), m.group(2)
+#         return prefix + body.replace('"', r'\"')
+#     json_string = re.sub(
+#         r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
+#         _esc,
+#         json_string
+#     )
+#     logger.debug("Escaped embedded quotes in logic fields.")
+#     logger.debug("Quoted unquoted keys.")
+#     # --- 6) Remove trailing commas before } or ] ---
+#     json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
+#     json_string = re.sub(r',\s*,', ',', json_string)
+#     logger.debug("Removed trailing commas.")
+#     # --- 7) Balance braces: drop extra } at end if needed ---
+#     ob, cb = json_string.count('{'), json_string.count('}')
+#     if cb > ob:
+#         excess = cb - ob
+#         json_string = json_string.rstrip()[:-excess]
+#         logger.debug(f"Stripped {excess} extra closing brace(s).")
+#     # --- 8) Escape literal newlines in *all* string values ---
+#     json_string = re.sub(
+#         r'"((?:[^"\\]|\\.)*?)"',
+#         lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
+#         json_string,
+#         flags=re.DOTALL
+#     )
+#     logger.debug("Escaped newlines in strings.")
+#     # --- 9) Final parse attempt ---
+#     try:
+#         return json.loads(json_string)
+#     except json.JSONDecodeError:
+#         logger.error("Sanitized JSON still invalid:\n%s", json_string)
+#         raise
 # def reduce_image_size_to_limit(clean_b64_str, max_kb=4000):
 #     """
     except json.JSONDecodeError as error_json:
         # If JSON parsing fails, use the json resolver agent
+        # correction_prompt = (
+        #     "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
+        #     "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
+        #     f"- **Error Details**: {error_json}\n\n"
+        #     "**Strict Instructions for your response:**\n"
+        #     "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
+        #     "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
+        #     "3. No trailing commas. Correct nesting.\n\n"
+        #     "Here is the problematic JSON string to correct:\n"
+        #     f"```json\n{llm_output_raw}\n```\n"
+        #     "Corrected JSON:\n"
+        # )
+        correction_prompt = f"""
+Fix this malformed response and return only the corrected JSON:
+Input: {llm_output_raw if 'llm_output_raw' in locals() else 'No response available'}
+Extract the sprite name and pseudocode, then return in this exact format:
+{{
+  "refined_logic": {{
+    "name_variable": "sprite_name",
+    "pseudocode": "pseudocode_here"
+  }}
+}}
+"""
         try:
             correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
             corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)