Spaces:

yalrashed
/

ScriptLLM

Sleeping

File size: 2,135 Bytes

import re

class AnalysisCleaner:
    """Clean up analysis text: drop repeated paragraphs and flatten lists.

    The cleaner is stateful: every paragraph it has emitted is remembered in
    ``seen_paragraphs`` so that repeats are dropped even when they arrive in
    a later call on the same instance. Call :meth:`reset` (or create a new
    instance) to start deduplicating from scratch.
    """

    # Leading list marker. "•" is always a bullet, but "*" and "-" only count
    # as one when followed by whitespace (or end of line), so "**bold**" and
    # "-5 degrees" keep their first character.
    _BULLET_RE = re.compile(r'^\s*(?:•\s*|[*\-](?:\s+|$))')
    # Leading "1. " style numbering. Requiring whitespace (or end of line)
    # after the dot keeps decimals such as "3.14" intact.
    _NUMBERING_RE = re.compile(r'^\s*\d+\.(?:\s+|$)')
    # Three or more consecutive newlines.
    _EXTRA_NEWLINES_RE = re.compile(r'\n{3,}')

    def __init__(self):
        # Normalized (lower-cased, whitespace-collapsed) paragraphs already
        # emitted by remove_duplicates on this instance.
        self.seen_paragraphs = set()

    def reset(self) -> None:
        """Forget every paragraph seen so far."""
        self.seen_paragraphs.clear()

    def remove_duplicates(self, text: str) -> str:
        """Remove duplicate paragraphs while preserving order.

        Paragraphs are the pieces of *text* separated by a blank line
        (``'\\n\\n'``). Two paragraphs count as duplicates when they are
        equal after lower-casing and collapsing whitespace runs. Paragraphs
        that are empty after normalization are dropped.

        Paragraphs seen in earlier calls on this instance are also treated
        as duplicates.

        Returns:
            The surviving paragraphs, in original form and order, joined
            with ``'\\n\\n'``.
        """
        unique_paragraphs = []

        for paragraph in text.split('\n\n'):
            # Case- and whitespace-insensitive key used only for comparison;
            # the paragraph itself is kept verbatim.
            normalized = ' '.join(paragraph.lower().split())
            if not normalized or normalized in self.seen_paragraphs:
                continue
            self.seen_paragraphs.add(normalized)
            unique_paragraphs.append(paragraph)

        return '\n\n'.join(unique_paragraphs)

    def reorganize_content(self, text: str) -> str:
        """Convert bullet points into flowing paragraphs.

        Each line is stripped and has a leading bullet (``*``, ``-``, ``•``)
        and/or ``N.`` numbering removed. Consecutive non-blank lines are then
        joined with single spaces into one paragraph. Blank lines end the
        current paragraph, and lines starting with ``###`` are kept as
        standalone section headers.

        Returns:
            Paragraphs and headers joined with ``'\\n\\n'``.
        """
        flowing_text = []
        current_paragraph = []

        def flush():
            # Emit the paragraph collected so far, if any, and start a new one.
            if current_paragraph:
                flowing_text.append(' '.join(current_paragraph))
                current_paragraph.clear()

        for line in text.split('\n'):
            # Strip first so indentation, trailing spaces and '\r' never end
            # up inside the joined paragraph or hide a '###' header.
            cleaned_line = self._BULLET_RE.sub('', line.strip())
            cleaned_line = self._NUMBERING_RE.sub('', cleaned_line)

            if not cleaned_line:
                # Blank line (or a bare list marker): paragraph boundary.
                flush()
            elif cleaned_line.startswith('###'):
                # Section headers stand alone and break the paragraph.
                flush()
                flowing_text.append(cleaned_line)
            else:
                current_paragraph.append(cleaned_line)

        flush()
        return '\n\n'.join(flowing_text)

    def clean_analysis(self, text: str) -> str:
        """Apply all cleanup steps.

        Runs :meth:`remove_duplicates`, then :meth:`reorganize_content`, and
        finally collapses any run of three or more newlines to two.

        Duplicates are detected on the raw paragraphs, before list markers
        are removed, and the check spans earlier calls on this instance.
        """
        cleaned = self.remove_duplicates(text)
        cleaned = self.reorganize_content(cleaned)

        # Safety net: keep at most one blank line between blocks.
        return self._EXTRA_NEWLINES_RE.sub('\n\n', cleaned)