Spaces:
Running
Running
| from typing import Any | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter | |
| from langflow.base.textsplitters.model import LCTextSplitterComponent | |
| from langflow.inputs.inputs import DataInput, IntInput, MessageTextInput | |
| from langflow.utils.util import unescape_string | |
class RecursiveCharacterTextSplitterComponent(LCTextSplitterComponent):
    """Langflow component that builds a LangChain ``RecursiveCharacterTextSplitter``.

    The component exposes four inputs (chunk size, chunk overlap, the data to
    split, and an optional list of separators) and turns them into a configured
    splitter in :meth:`build_text_splitter`.
    """

    # Metadata describing the component.
    display_name: str = "Recursive Character Text Splitter"
    description: str = "Split text trying to keep all related text together."
    documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter"
    name = "RecursiveCharacterTextSplitter"
    icon = "LangChain"

    # Input declarations; each ``name`` is read back below as ``self.<name>``.
    inputs = [
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum length of each chunk.",
            value=1000,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="The amount of overlap between chunks.",
            value=200,
        ),
        DataInput(
            name="data_input",
            display_name="Input",
            info="The texts to split.",
            input_types=["Document", "Data"],
        ),
        MessageTextInput(
            name="separators",
            display_name="Separators",
            info='The characters to split on.\nIf left empty defaults to ["\\n\\n", "\\n", " ", ""].',
            is_list=True,
        ),
    ]

    def get_data_input(self) -> Any:
        """Return the value supplied for the ``data_input`` field."""
        return self.data_input

    def build_text_splitter(self) -> TextSplitter:
        """Create the splitter from the component's current input values.

        Returns:
            A ``RecursiveCharacterTextSplitter`` configured with this
            component's ``chunk_size``, ``chunk_overlap`` and separators.
        """
        configured = self.separators
        # An empty/missing separators value is forwarded as ``None`` so the
        # splitter applies its own defaults; otherwise every entry goes through
        # ``unescape_string`` so escaped characters typed by the user are
        # unescaped before being handed to the splitter.
        separators: list[str] | None = (
            [unescape_string(entry) for entry in configured] if configured else None
        )
        return RecursiveCharacterTextSplitter(
            separators=separators,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )