Spaces:
Running
Running
| from langchain_text_splitters import CharacterTextSplitter | |
| from langflow.custom import Component | |
| from langflow.io import HandleInput, IntInput, MessageTextInput, Output | |
| from langflow.schema import Data | |
| from langflow.utils.util import unescape_string | |
class SplitTextComponent(Component):
    """Component that splits incoming ``Data`` items into text chunks.

    The chunking itself is delegated to ``CharacterTextSplitter``, configured
    from the ``chunk_overlap``, ``chunk_size`` and ``separator`` inputs
    declared below.
    """

    display_name: str = "Split Text"
    description: str = "Split text into chunks based on specified criteria."
    icon = "scissors-line-dashed"
    name = "SplitText"

    # Input fields, in the order they are declared for the component.
    inputs = [
        HandleInput(
            name="data_inputs",
            display_name="Data Inputs",
            info="The data to split.",
            input_types=["Data"],
            is_list=True,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="Number of characters to overlap between chunks.",
            value=200,
        ),
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk.",
            value=1000,
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info="The character to split on. Defaults to newline.",
            value="\n",
        ),
    ]

    # Single output, produced by calling ``split_text``.
    outputs = [
        Output(display_name="Chunks", name="chunks", method="split_text"),
    ]

    def _docs_to_data(self, docs):
        """Wrap each document in a ``Data`` object.

        For every item in ``docs``, ``page_content`` becomes the ``text`` of
        the new ``Data`` and ``metadata`` becomes its ``data``.

        Returns:
            A list of ``Data`` objects, one per input document, in order.
        """
        converted = []
        for document in docs:
            converted.append(Data(text=document.page_content, data=document.metadata))
        return converted

    def split_text(self) -> list[Data]:
        """Split every ``Data`` item in ``data_inputs`` into chunks.

        Items of ``data_inputs`` that are not ``Data`` instances are skipped.
        The resulting chunks are also stored on ``self.status``.

        Returns:
            The chunks produced by the splitter, converted back to ``Data``.
        """
        # The raw separator value goes through ``unescape_string`` before use.
        split_on = unescape_string(self.separator)

        lc_documents = []
        for item in self.data_inputs:
            if not isinstance(item, Data):
                continue
            lc_documents.append(item.to_lc_document())

        text_splitter = CharacterTextSplitter(
            chunk_overlap=self.chunk_overlap,
            chunk_size=self.chunk_size,
            separator=split_on,
        )

        chunks = self._docs_to_data(text_splitter.split_documents(lc_documents))
        self.status = chunks
        return chunks