import os
import re
from urllib.parse import urljoin, urlparse

import cv2
import docx
import PyPDF2
import pytesseract
import requests
from bs4 import BeautifulSoup


def extract_text_from_image(image_path):
    """Extract text from an image using OCR."""
    try:
        # Fail fast with a clear message if the Tesseract binary is missing.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return ("Error: Tesseract OCR is not installed. Please install Tesseract "
                    "to extract text from images. See install_tesseract.md for instructions.")
        image = cv2.imread(image_path)
        if image is None:
            return "Error: Could not read image file"
        # Grayscale plus Otsu binarization gives Tesseract a clean,
        # high-contrast input (cv2.imread returns BGR, so convert directly).
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # --psm 6: treat the image as a single uniform block of text.
        text = pytesseract.image_to_string(binary, config='--psm 6')
        return text.strip() if text.strip() else "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"


def extract_text_from_file(file_path):
    """Extract text from a file, dispatching on its extension."""
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for image-only pages.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext in (".txt", ".md", ".csv"):
            # Plain-text formats are read verbatim.
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        elif ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)
        else:
            return ""
    except Exception as e:
        return f"Error extracting text: {e}"


def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL."""
    try:
        # Default to https:// when the caller omits the scheme.
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"
        # Browser-like headers reduce the chance of being served a bot-block page.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
        }
        session = requests.Session()
        session.headers.update(headers)
        # Retry 403 responses with a different User-Agent in case the block
        # is a simple user-agent check.
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                raise
        # Decode the body, preferring the detected (apparent) encoding.
        try:
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            raw_html = response.content.decode('utf-8', errors='ignore')
        # If the result does not look like HTML, re-decode as latin-1, which
        # maps every byte value and therefore cannot fail.
        stripped = raw_html.strip()
        if not stripped.startswith('<!DOCTYPE') and not stripped.startswith('<html'):
            raw_html = response.content.decode('latin-1', errors='ignore')
        soup = BeautifulSoup(raw_html, 'html.parser')
        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""
        # Collect substantial text blocks from the usual main-content containers.
        content_sections = []
        main_selectors = ['main', 'article', '.content', '.main-content', '.post-content',
                          '#content', '#main', '.entry-content', '.post-body']
        for selector in main_selectors:
            for element in soup.select(selector):
                text = element.get_text().strip()
                if len(text) > 100:
                    content_sections.append(text)
        # Gather navigation links from <nav> and <header> elements.
        nav_links = []
        for nav in soup.find_all(['nav', 'header']):
            for link in nav.find_all('a'):
                link_text = link.get_text().strip()
                link_href = link.get('href', '')
                if link_text and link_href:
                    nav_links.append(f"{link_text}: {link_href}")

        def absolutize(resource_url):
            """Resolve protocol-relative, root-relative, and relative URLs."""
            if resource_url.startswith('//'):
                return 'https:' + resource_url
            if not resource_url.startswith(('http://', 'https://')):
                return urljoin(url, resource_url)
            return resource_url

        # Rewrite image sources to absolute URLs so they resolve outside the
        # page; fall back to the lazy-loading data-src when src is empty.
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = absolutize(src)
            else:
                data_src = img.get('data-src', '')
                if data_src:
                    img['src'] = absolutize(data_src)
        # Rewrite CSS background-image URLs, both in inline style attributes
        # and inside <style> blocks.
        bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            for match in re.findall(bg_pattern, style_attr, re.IGNORECASE):
                style_attr = style_attr.replace(match, absolutize(match))
            element['style'] = style_attr
        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                for match in re.findall(bg_pattern, style_content, re.IGNORECASE):
                    style_content = style_content.replace(match, absolutize(match))
                style.string = style_content
        # Collect image metadata, then keep only the first ten images whose
        # URLs answer a HEAD request (bounding the number of network calls).
        images = []
        for img in soup.find_all('img'):
            src = img.get('src', '')
            alt = img.get('alt', '')
            if src:
                images.append({'src': src, 'alt': alt})

        def test_image_url(img_url):
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except Exception:
                return False

        working_images = [img for img in images[:10] if test_image_url(img['src'])]
        # Strip HTML comments and collapse whitespace to shrink the payload,
        # then cap it at 15,000 characters.
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
        if not title_text or title_text == "No title found":
            title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"
        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
4. The website is using advanced compression or encoding
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
1. Create a typical layout for this type of website
2. Use placeholder content that would be appropriate
3. Include modern design elements and responsive features
4. Use a clean, professional design with good typography
5. Make it mobile-friendly and accessible
This will help me create a better design for you."""
            return website_content.strip()
        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
===[TRUNCATED FOR BREVITY]==="""
        return website_content.strip()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return ("Error: Website blocked access (403 Forbidden). This website may have "
                    "anti-bot protection. Try a different website or provide a description "
                    "of what you want to build instead.")
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"