Rename extractor.py to search_replace.py
extractor.py        +0  -55
search_replace.py   +90  -0
extractor.py
DELETED
@@ -1,55 +0,0 @@
-# /extractor.py
-""" Handles content extraction from various sources like files, images, and websites. """
-import mimetypes, os, re, logging
-from urllib.parse import urljoin
-import PyPDF2, docx, requests
-from bs4 import BeautifulSoup
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-try:
-    import cv2, pytesseract
-    OCR_AVAILABLE = True
-except ImportError:
-    OCR_AVAILABLE = False
-    logging.warning("OCR libraries not found. Text extraction from images will be disabled.")
-
-def extract_text_from_image(image_path: str) -> str:
-    if not OCR_AVAILABLE: return "Error: OCR dependencies not installed."
-    try:
-        image = cv2.imread(image_path)
-        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-        return pytesseract.image_to_string(gray) or "No text found in image."
-    except Exception as e: return f"Error during OCR: {e}"
-
-def extract_text_from_file(file_path: str) -> str:
-    if not file_path: return ""
-    ext = os.path.splitext(file_path)[1].lower()
-    try:
-        if ext == ".pdf":
-            with open(file_path, "rb") as f: return "\n".join(p.extract_text() or "" for p in PyPDF2.PdfReader(f).pages)
-        elif ext == ".docx":
-            return "\n".join(p.text for p in docx.Document(file_path).paragraphs)
-        elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py"]:
-            with open(file_path, "r", encoding="utf-8", errors="ignore") as f: return f.read()
-        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
-            return extract_text_from_image(file_path)
-        else: return f"Unsupported file type: {ext}"
-    except Exception as e: return f"Error extracting text: {e}"
-
-def extract_website_content(url: str) -> str:
-    try:
-        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
-        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
-        response.raise_for_status()
-        response.encoding = response.apparent_encoding
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
-            for item in soup.find_all(tag):
-                if item.has_attr(attr): item[attr] = urljoin(url, item[attr])
-
-        body_content = str(soup)
-        if len(body_content) > 15000: body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
-        return f"<!-- Original URL: {url} -->\n{body_content}"
-    except Exception as e: return f"Error: Could not fetch content from {url}. Details: {e}"
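For context, the deleted module exposed three helpers; a minimal usage sketch follows (the file names and URL are illustrative, not from the repo):

    # Hypothetical usage of the removed extractor module
    from extractor import extract_text_from_file, extract_website_content

    pdf_text = extract_text_from_file("report.pdf")    # dispatches on extension: PDF via PyPDF2
    ocr_text = extract_text_from_file("scan.png")      # images go through cv2 + pytesseract, if installed
    page_html = extract_website_content("https://example.com")  # fetches, absolutizes links, truncates at 15,000 chars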
search_replace.py
ADDED
@@ -0,0 +1,90 @@
+def apply_search_replace_changes(original_content: str, changes_text: str) -> str:
+    """Apply search/replace changes to content (HTML, Python, etc.)"""
+    if not changes_text.strip():
+        return original_content
+    blocks=[]
+    current_block=""
+    lines=changes_text.split('\n')
+    for line in lines:
+        if line.strip()==SEARCH_START:
+            if current_block.strip(): blocks.append(current_block.strip())
+            current_block=line+"\n"
+        elif line.strip()==REPLACE_END:
+            current_block+=line+"\n"
+            blocks.append(current_block.strip())
+            current_block=""
+        else:
+            current_block+=line+"\n"
+    if current_block.strip(): blocks.append(current_block.strip())
+    modified_content=original_content
+    for block in blocks:
+        lines=block.split('\n')
+        search_lines=[]; replace_lines=[]
+        in_search=False; in_replace=False
+        for ln in lines:
+            if ln.strip()==SEARCH_START:
+                in_search=True; in_replace=False
+            elif ln.strip()==DIVIDER:
+                in_search=False; in_replace=True
+            elif ln.strip()==REPLACE_END:
+                in_replace=False
+            elif in_search:
+                search_lines.append(ln)
+            elif in_replace:
+                replace_lines.append(ln)
+        if search_lines:
+            search_text='\n'.join(search_lines).strip()
+            replace_text='\n'.join(replace_lines).strip()
+            if search_text in modified_content:
+                modified_content=modified_content.replace(search_text,replace_text)
+            else:
+                print(f"Warning: Search text not found in content: {search_text[:100]}...")
+    return modified_content
+
+def apply_transformers_js_search_replace_changes(original_formatted_content: str, changes_text: str) -> str:
+    """Apply search/replace changes to transformers.js formatted content (three files)"""
+    if not changes_text.strip():
+        return original_formatted_content
+    files=parse_transformers_js_output(original_formatted_content)
+    blocks=[]; current_block=""
+    lines=changes_text.split('\n')
+    for line in lines:
+        if line.strip()==SEARCH_START:
+            if current_block.strip(): blocks.append(current_block.strip())
+            current_block=line+"\n"
+        elif line.strip()==REPLACE_END:
+            current_block+=line+"\n"
+            blocks.append(current_block.strip())
+            current_block=""
+        else:
+            current_block+=line+"\n"
+    if current_block.strip(): blocks.append(current_block.strip())
+    for block in blocks:
+        lines=block.split('\n')
+        search_lines=[]; replace_lines=[]
+        in_search=False; in_replace=False; target_file=None
+        for ln in lines:
+            if ln.strip()==SEARCH_START:
+                in_search=True; in_replace=False
+            elif ln.strip()==DIVIDER:
+                in_search=False; in_replace=True
+            elif ln.strip()==REPLACE_END:
+                in_replace=False
+            elif in_search:
+                search_lines.append(ln)
+            elif in_replace:
+                replace_lines.append(ln)
+        if search_lines:
+            search_text='\n'.join(search_lines).strip()
+            replace_text='\n'.join(replace_lines).strip()
+            if search_text in files['index.html']:
+                target_file='index.html'
+            elif search_text in files['index.js']:
+                target_file='index.js'
+            elif search_text in files['style.css']:
+                target_file='style.css'
+            if target_file and search_text in files[target_file]:
+                files[target_file]=files[target_file].replace(search_text,replace_text)
+            else:
+                print(f"Warning: Search text not found in any transformers.js file: {search_text[:100]}...")
+    return format_transformers_js_output(files)
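Note that the new module references five names it never defines: SEARCH_START, DIVIDER, REPLACE_END, parse_transformers_js_output, and format_transformers_js_output. They presumably come from elsewhere in the project. A minimal sketch of plausible definitions, assuming aider-style block markers and a simple "=== filename ===" bundle format (both are assumptions, not confirmed by this diff):

    # Assumed marker strings (aider-style convention; actual values live in another module)
    SEARCH_START = "<<<<<<< SEARCH"
    DIVIDER = "======="
    REPLACE_END = ">>>>>>> REPLACE"

    # Hypothetical bundle helpers for the transformers.js three-file format
    def parse_transformers_js_output(formatted: str) -> dict:
        """Split a '=== name ===' bundle into its three files."""
        files = {'index.html': '', 'index.js': '', 'style.css': ''}
        current = None
        for line in formatted.split('\n'):
            name = line.strip().strip('= ')
            if line.strip().startswith('===') and name in files:
                current = name
            elif current is not None:
                files[current] += line + '\n'
        return files

    def format_transformers_js_output(files: dict) -> str:
        """Inverse of the parser: join the three files back into one bundle."""
        return '\n'.join(f"=== {name} ===\n{content}" for name, content in files.items())

With those marker values in scope, a quick check of apply_search_replace_changes:

    original = "<h1>Hello</h1>"
    changes = "<<<<<<< SEARCH\n<h1>Hello</h1>\n=======\n<h1>Goodbye</h1>\n>>>>>>> REPLACE"
    print(apply_search_replace_changes(original, changes))  # -> <h1>Goodbye</h1>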