prthm11 committed on
Commit
5e4aebd
·
verified ·
1 Parent(s): 78918c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +236 -87
app.py CHANGED
@@ -331,26 +331,42 @@ SYSTEM_PROMPT ="""Your task is to process OCR-extracted text from images of Scra
331
  # - Correcting syntax errors such as missing commas, brackets, or quotes.
332
  # - Ensuring the JSON structure matches the schema exactly.
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  # Remember: Your output must be valid JSON only, ready to be parsed without errors.
335
  # """
336
  SYSTEM_PROMPT_JSON_CORRECTOR = """
337
- You are an assistant that outputs JSON responses strictly following the given schema.
338
- If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
339
- Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
340
-
341
- If you receive an invalid or incomplete JSON response, fix it by:
342
- - Adding any missing required fields with appropriate values.
343
- - Correcting syntax errors such as missing commas, brackets, or quotes.
344
- - Ensuring the JSON structure matches the schema exactly.
345
- - Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
346
- - Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
347
- - Never output explanations, comments, or extra text — only the corrected JSON.
348
- - **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
349
-
350
- Remember: Your output must be valid JSON only, ready to be parsed without errors.
 
351
  """
352
 
353
-
354
  # debugger and resolver agent for Scratch 3.0
355
  # Main agent of the system agent for Scratch 3.0
356
  agent = create_react_agent(
@@ -791,69 +807,189 @@ ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
791
 
792
  # Helper function to extract JSON from LLM response
793
  def extract_json_from_llm_response(raw_response: str) -> dict:
794
- # --- 1) Pull out the JSON code‑block if present ---
 
 
 
 
 
795
  md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
796
- json_string = md.group(1).strip() if md else raw_response
797
-
798
- # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
799
- first, last = json_string.find('{'), json_string.rfind('}')
800
- if 0 <= first < last:
801
- json_string = json_string[first:last+1]
802
-
803
- # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
804
- json_string = re.sub(r'\b\w+\s*{', '{', json_string)
805
- json_string = re.sub(r'"assistant"\s*:', '', json_string)
806
- json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
807
- logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
808
-
809
- # --- 3.1) Fix stray inner quotes at start of name/list values ---
810
- # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
811
- json_string = re.sub(
812
- r'("name"\s*:\s*")\s*"',
813
- r'\1',
814
- json_string
815
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
 
817
- # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
818
- def _esc(m):
819
- prefix, body = m.group(1), m.group(2)
820
- return prefix + body.replace('"', r'\"')
821
- json_string = re.sub(
822
- r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
823
- _esc,
824
- json_string
825
- )
826
- logger.debug("Escaped embedded quotes in logic fields.")
827
-
828
- logger.debug("Quoted unquoted keys.")
829
-
830
- # --- 6) Remove trailing commas before } or ] ---
831
- json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
832
- json_string = re.sub(r',\s*,', ',', json_string)
833
- logger.debug("Removed trailing commas.")
834
-
835
- # --- 7) Balance braces: drop extra } at end if needed ---
836
- ob, cb = json_string.count('{'), json_string.count('}')
837
- if cb > ob:
838
- excess = cb - ob
839
- json_string = json_string.rstrip()[:-excess]
840
- logger.debug(f"Stripped {excess} extra closing brace(s).")
841
-
842
- # --- 8) Escape literal newlines in *all* string values ---
843
- json_string = re.sub(
844
- r'"((?:[^"\\]|\\.)*?)"',
845
- lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
846
- json_string,
847
- flags=re.DOTALL
848
- )
849
- logger.debug("Escaped newlines in strings.")
850
 
851
- # --- 9) Final parse attempt ---
852
- try:
853
- return json.loads(json_string)
854
- except json.JSONDecodeError:
855
- logger.error("Sanitized JSON still invalid:\n%s", json_string)
856
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857
 
858
  # def reduce_image_size_to_limit(clean_b64_str, max_kb=4000):
859
  # """
@@ -1440,18 +1576,31 @@ end
1440
 
1441
  except json.JSONDecodeError as error_json:
1442
  # If JSON parsing fails, use the json resolver agent
1443
- correction_prompt = (
1444
- "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
1445
- "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
1446
- f"- **Error Details**: {error_json}\n\n"
1447
- "**Strict Instructions for your response:**\n"
1448
- "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
1449
- "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
1450
- "3. No trailing commas. Correct nesting.\n\n"
1451
- "Here is the problematic JSON string to correct:\n"
1452
- f"```json\n{llm_output_raw}\n```\n"
1453
- "Corrected JSON:\n"
1454
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
1455
  try:
1456
  correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
1457
  corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)
 
331
  # - Correcting syntax errors such as missing commas, brackets, or quotes.
332
  # - Ensuring the JSON structure matches the schema exactly.
333
 
334
+ # Remember: Your output must be valid JSON only, ready to be parsed without errors.
335
+ # """
336
+ # SYSTEM_PROMPT_JSON_CORRECTOR = """
337
+ # You are an assistant that outputs JSON responses strictly following the given schema.
338
+ # If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
339
+ # Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
340
+
341
+ # If you receive an invalid or incomplete JSON response, fix it by:
342
+ # - Adding any missing required fields with appropriate values.
343
+ # - Correcting syntax errors such as missing commas, brackets, or quotes.
344
+ # - Ensuring the JSON structure matches the schema exactly.
345
+ # - Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
346
+ # - Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
347
+ # - Never output explanations, comments, or extra text — only the corrected JSON.
348
+ # - **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
349
+
350
  # Remember: Your output must be valid JSON only, ready to be parsed without errors.
351
  # """
352
  SYSTEM_PROMPT_JSON_CORRECTOR = """
353
+ You are a JSON correction assistant. Your ONLY task is to fix malformed JSON and return it in the correct format.
354
+
355
+ REQUIRED OUTPUT FORMAT:
356
+ {
357
+ "refined_logic": {
358
+ "name_variable": "sprite_name_here",
359
+ "pseudocode": "pseudocode_string_here"
360
+ }
361
+ }
362
+
363
+ RULES:
364
+ 1. Extract the sprite name and pseudocode from the input
365
+ 2. Return ONLY valid JSON in the exact format above
366
+ 3. No explanations, no extra text, no other fields
367
+ 4. If you can't find the data, use "Unknown" for name_variable and "No pseudocode found" for pseudocode
368
  """
369
 
 
370
  # debugger and resolver agent for Scratch 3.0
371
  # Main agent of the system agent for Scratch 3.0
372
  agent = create_react_agent(
 
807
 
808
  # Helper function to extract JSON from LLM response
809
  def extract_json_from_llm_response(raw_response: str) -> dict:
810
+ """
811
+ Improved JSON extraction with better error handling and validation
812
+ """
813
+ print(f"Raw LLM response: {raw_response[:200]}...")
814
+
815
+ # Try to find JSON in code blocks first
816
  md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
817
+ if md:
818
+ json_string = md.group(1).strip()
819
+ else:
820
+ json_string = raw_response.strip()
821
+
822
+ # Find the first complete JSON object (handle cases with multiple objects/arrays)
823
+ first_brace = json_string.find('{')
824
+ if first_brace == -1:
825
+ print("No JSON object found in response")
826
+ return {
827
+ "refined_logic": {
828
+ "name_variable": "No Code-blocks",
829
+ "pseudocode": "No Code-blocks"
830
+ }
831
+ }
832
+
833
+ # Find the matching closing brace for the first opening brace
834
+ brace_count = 0
835
+ last_brace = -1
836
+ for i, char in enumerate(json_string[first_brace:], first_brace):
837
+ if char == '{':
838
+ brace_count += 1
839
+ elif char == '}':
840
+ brace_count -= 1
841
+ if brace_count == 0:
842
+ last_brace = i
843
+ break
844
+
845
+ if last_brace == -1:
846
+ print("No matching closing brace found")
847
+ return {
848
+ "refined_logic": {
849
+ "name_variable": "Parse Error",
850
+ "pseudocode": "Malformed JSON"
851
+ }
852
+ }
853
+
854
+ json_string = json_string[first_brace:last_brace+1]
855
+
856
+ # Simple cleanup - just handle the most common issues
857
+ # 1. Remove trailing commas
858
+ json_string = re.sub(r',\s*}', '}', json_string)
859
+ json_string = re.sub(r',\s*]', ']', json_string)
860
+
861
+ # 2. Fix single quotes around simple values (not containing quotes)
862
+ json_string = re.sub(r"'([^'\"]*)'(\s*:)", r'"\1"\2', json_string) # Keys
863
+ json_string = re.sub(r"(:\s*)'([^'\"]*)'(\s*[,}])", r'\1"\2"\3', json_string) # Simple values
864
+
865
+ print(f"Cleaned JSON string: {json_string[:200]}...")
866
+
867
+ try:
868
+ parsed = json.loads(json_string)
869
+
870
+ # Validate the expected structure
871
+ if not isinstance(parsed, dict):
872
+ raise ValueError("Response is not a JSON object")
873
+
874
+ if "refined_logic" not in parsed:
875
+ raise ValueError("Missing 'refined_logic' key")
876
+
877
+ refined_logic = parsed["refined_logic"]
878
+ if not isinstance(refined_logic, dict):
879
+ raise ValueError("'refined_logic' is not an object")
880
+
881
+ if "name_variable" not in refined_logic or "pseudocode" not in refined_logic:
882
+ raise ValueError("Missing required keys in 'refined_logic'")
883
+
884
+ print("Successfully parsed and validated JSON")
885
+ return parsed
886
+
887
+ except (json.JSONDecodeError, ValueError) as e:
888
+ print(f"JSON parsing failed: {e}")
889
+
890
+ # Try to extract meaningful data even from malformed JSON using regex
891
+ try:
892
+ # Look for name_variable and pseudocode patterns with more flexible matching
893
+ name_match = re.search(r'"name_variable":\s*["\']([^"\']*)["\']', raw_response)
894
+ pseudo_match = re.search(r'"pseudocode":\s*["\']([^"\']*)["\']', raw_response)
895
+
896
+ if name_match and pseudo_match:
897
+ print("Extracted data using regex fallback")
898
+ return {
899
+ "refined_logic": {
900
+ "name_variable": name_match.group(1),
901
+ "pseudocode": pseudo_match.group(1)
902
+ }
903
+ }
904
+
905
+ # Try to find any valid JSON-like structure in the response
906
+ # Look for patterns like {'refined_logic': 'pseudocode', 'block_relationships': [...]}
907
+ alt_match = re.search(r"'name_variable':\s*'([^']*)'.*?'pseudocode':\s*'([^']*)'", raw_response, re.DOTALL)
908
+ if alt_match:
909
+ print("Extracted data using alternative pattern")
910
+ return {
911
+ "refined_logic": {
912
+ "name_variable": alt_match.group(1),
913
+ "pseudocode": alt_match.group(2)
914
+ }
915
+ }
916
+
917
+ except Exception as regex_error:
918
+ print(f"Regex extraction also failed: {regex_error}")
919
+
920
+ # Return a default structure on parsing failure
921
+ return {
922
+ "refined_logic": {
923
+ "name_variable": "Parse Error",
924
+ "pseudocode": "Failed to parse response"
925
+ }
926
+ }
927
+
928
 
929
+ # def extract_json_from_llm_response(raw_response: str) -> dict:
930
+ # # --- 1) Pull out the JSON code‑block if present ---
931
+ # md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
932
+ # json_string = md.group(1).strip() if md else raw_response
933
+
934
+ # # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
935
+ # first, last = json_string.find('{'), json_string.rfind('}')
936
+ # if 0 <= first < last:
937
+ # json_string = json_string[first:last+1]
938
+
939
+ # # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
940
+ # json_string = re.sub(r'\b\w+\s*{', '{', json_string)
941
+ # json_string = re.sub(r'"assistant"\s*:', '', json_string)
942
+ # json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
943
+ # logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
944
+
945
+ # # --- 3.1) Fix stray inner quotes at start of name/list values ---
946
+ # # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
947
+ # json_string = re.sub(
948
+ # r'("name"\s*:\s*")\s*"',
949
+ # r'\1',
950
+ # json_string
951
+ # )
 
 
 
 
 
 
 
 
 
 
952
 
953
+ # # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
954
+ # def _esc(m):
955
+ # prefix, body = m.group(1), m.group(2)
956
+ # return prefix + body.replace('"', r'\"')
957
+ # json_string = re.sub(
958
+ # r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
959
+ # _esc,
960
+ # json_string
961
+ # )
962
+ # logger.debug("Escaped embedded quotes in logic fields.")
963
+
964
+ # logger.debug("Quoted unquoted keys.")
965
+
966
+ # # --- 6) Remove trailing commas before } or ] ---
967
+ # json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
968
+ # json_string = re.sub(r',\s*,', ',', json_string)
969
+ # logger.debug("Removed trailing commas.")
970
+
971
+ # # --- 7) Balance braces: drop extra } at end if needed ---
972
+ # ob, cb = json_string.count('{'), json_string.count('}')
973
+ # if cb > ob:
974
+ # excess = cb - ob
975
+ # json_string = json_string.rstrip()[:-excess]
976
+ # logger.debug(f"Stripped {excess} extra closing brace(s).")
977
+
978
+ # # --- 8) Escape literal newlines in *all* string values ---
979
+ # json_string = re.sub(
980
+ # r'"((?:[^"\\]|\\.)*?)"',
981
+ # lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
982
+ # json_string,
983
+ # flags=re.DOTALL
984
+ # )
985
+ # logger.debug("Escaped newlines in strings.")
986
+
987
+ # # --- 9) Final parse attempt ---
988
+ # try:
989
+ # return json.loads(json_string)
990
+ # except json.JSONDecodeError:
991
+ # logger.error("Sanitized JSON still invalid:\n%s", json_string)
992
+ # raise
993
 
994
  # def reduce_image_size_to_limit(clean_b64_str, max_kb=4000):
995
  # """
 
1576
 
1577
  except json.JSONDecodeError as error_json:
1578
  # If JSON parsing fails, use the json resolver agent
1579
+ # correction_prompt = (
1580
+ # "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
1581
+ # "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
1582
+ # f"- **Error Details**: {error_json}\n\n"
1583
+ # "**Strict Instructions for your response:**\n"
1584
+ # "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
1585
+ # "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
1586
+ # "3. No trailing commas. Correct nesting.\n\n"
1587
+ # "Here is the problematic JSON string to correct:\n"
1588
+ # f"```json\n{llm_output_raw}\n```\n"
1589
+ # "Corrected JSON:\n"
1590
+ # )
1591
+ correction_prompt = f"""
1592
+ Fix this malformed response and return only the corrected JSON:
1593
+
1594
+ Input: {llm_output_raw if 'llm_output_raw' in locals() else 'No response available'}
1595
+
1596
+ Extract the sprite name and pseudocode, then return in this exact format:
1597
+ {{
1598
+ "refined_logic": {{
1599
+ "name_variable": "sprite_name",
1600
+ "pseudocode": "pseudocode_here"
1601
+ }}
1602
+ }}
1603
+ """
1604
  try:
1605
  correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
1606
  corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)