from langchain.docstore.document import Document
from langchain_experimental.text_splitter import SemanticChunker

from langflow.base.textsplitters.model import LCTextSplitterComponent
from langflow.io import (
    DropdownInput,
    FloatInput,
    HandleInput,
    IntInput,
    MessageTextInput,
    Output,
)
from langflow.schema import Data


class SemanticTextSplitterComponent(LCTextSplitterComponent):
    """Split text into semantically meaningful chunks using semantic similarity."""

    display_name: str = "Semantic Text Splitter"
    name: str = "SemanticTextSplitter"
    description: str = "Split text into semantically meaningful chunks using semantic similarity."
    documentation = "https://python.langchain.com/docs/how_to/semantic-chunker/"
    beta = True  # Beta because it wraps SemanticChunker from langchain_experimental.
    icon = "LangChain"

    inputs = [
        HandleInput(
            name="data_inputs",
            display_name="Data Inputs",
            info="List of Data objects containing text and metadata to split.",
            input_types=["Data"],
            is_list=True,
        ),
        HandleInput(
            name="embeddings",
            display_name="Embeddings",
            info="Embeddings model to use for semantic similarity. Required.",
            input_types=["Embeddings"],
            is_list=False,
        ),
        DropdownInput(
            name="breakpoint_threshold_type",
            display_name="Breakpoint Threshold Type",
            info=(
                "Method to determine breakpoints. Options: 'percentile', "
                "'standard_deviation', 'interquartile'. Defaults to 'percentile'."
            ),
            value="percentile",
            options=["percentile", "standard_deviation", "interquartile"],
        ),
        FloatInput(
            name="breakpoint_threshold_amount",
            display_name="Breakpoint Threshold Amount",
            info="Numerical amount for the breakpoint threshold.",
            value=0.5,
        ),
        IntInput(
            name="number_of_chunks",
            display_name="Number of Chunks",
            info="Number of chunks to split the text into.",
            value=5,
        ),
        MessageTextInput(
            name="sentence_split_regex",
            display_name="Sentence Split Regex",
            info="Regular expression used to split text into sentences. Optional.",
            value="",
            advanced=True,
        ),
        IntInput(
            name="buffer_size",
            display_name="Buffer Size",
            info="Number of sentences to combine when evaluating semantic similarity.",
            value=0,
            advanced=True,
        ),
    ]

    outputs = [
        Output(display_name="Chunks", name="chunks", method="split_text"),
    ]

    def _docs_to_data(self, docs: list[Document]) -> list[Data]:
        """Convert a list of Document objects to Data objects."""
        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

    def split_text(self) -> list[Data]:
        """Split the input data into semantically meaningful chunks."""
        try:
            embeddings = getattr(self, "embeddings", None)
            if embeddings is None:
                error_msg = "An embeddings model is required for SemanticTextSplitter."
                raise ValueError(error_msg)

            if not self.data_inputs:
                error_msg = "Data inputs cannot be empty."
                raise ValueError(error_msg)

            documents = []
            for _input in self.data_inputs:
                if isinstance(_input, Data):
                    documents.append(_input.to_lc_document())
                else:
                    error_msg = f"Invalid data input type: {_input}"
                    raise TypeError(error_msg)

            if not documents:
                error_msg = "No valid Data objects found in data_inputs."
                raise ValueError(error_msg)

            texts = [doc.page_content for doc in documents]
            metadatas = [doc.metadata for doc in documents]

            splitter_params = {
                "embeddings": embeddings,
                "breakpoint_threshold_type": self.breakpoint_threshold_type or "percentile",
                "breakpoint_threshold_amount": self.breakpoint_threshold_amount,
                "number_of_chunks": self.number_of_chunks,
                "buffer_size": self.buffer_size,
            }
            if self.sentence_split_regex:
                splitter_params["sentence_split_regex"] = self.sentence_split_regex

            splitter = SemanticChunker(**splitter_params)
            docs = splitter.create_documents(texts, metadatas=metadatas)

            data = self._docs_to_data(docs)
            self.status = data
        except Exception as e:
            error_msg = f"An error occurred during semantic splitting: {e}"
            raise RuntimeError(error_msg) from e
        else:
            return data
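

# --- Usage sketch (not part of the component) ---
# A minimal example of exercising the wrapped SemanticChunker directly,
# outside Langflow, mirroring what split_text() does with its inputs.
# DeterministicFakeEmbedding is a stand-in assumption so the sketch runs
# without API keys; swap in a real embeddings model (e.g. OpenAIEmbeddings).
if __name__ == "__main__":
    from langchain_community.embeddings import DeterministicFakeEmbedding

    example_embeddings = DeterministicFakeEmbedding(size=256)
    chunker = SemanticChunker(
        example_embeddings,
        breakpoint_threshold_type="percentile",
    )
    # create_documents takes raw texts plus per-text metadata, exactly as
    # split_text() passes them after unpacking its Data inputs.
    example_docs = chunker.create_documents(
        ["The sky is blue today. Clouds are rolling in. Stocks rallied on strong earnings."],
        metadatas=[{"source": "example"}],
    )
    for doc in example_docs:
        print(doc.page_content, "|", doc.metadata)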