Spaces:
Running
Running
| from typing import Any | |
| from langchain_text_splitters import NLTKTextSplitter, TextSplitter | |
| from langflow.base.textsplitters.model import LCTextSplitterComponent | |
| from langflow.inputs import DataInput, IntInput, MessageTextInput | |
| from langflow.utils.util import unescape_string | |
class NaturalLanguageTextSplitterComponent(LCTextSplitterComponent):
    """Langflow component that builds a LangChain ``NLTKTextSplitter``.

    The component exposes the chunk size, chunk overlap, separator and
    language as inputs and turns them into a configured splitter in
    ``build_text_splitter``. The data to split is supplied through the
    ``data_input`` field and handed out by ``get_data_input``.
    """

    display_name = "Natural Language Text Splitter"
    description = "Split text based on natural language boundaries, optimized for a specified language."
    documentation = (
        "https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/split_by_token/#nltk"
    )
    name = "NaturalLanguageTextSplitter"
    icon = "LangChain"

    inputs = [
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk after splitting.",
            value=1000,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="The number of characters that overlap between consecutive chunks.",
            value=200,
        ),
        DataInput(
            name="data_input",
            display_name="Input",
            info="The text data to be split.",
            input_types=["Document", "Data"],
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info='The character(s) to use as a delimiter when splitting text.\nDefaults to "\\n\\n" if left empty.',
        ),
        MessageTextInput(
            name="language",
            display_name="Language",
            info='The language of the text. Default is "English". '
            "Supports multiple languages for better text boundary recognition.",
        ),
    ]

    def get_data_input(self) -> Any:
        """Return the value of the ``data_input`` field."""
        return self.data_input

    def build_text_splitter(self) -> TextSplitter:
        """Create the ``NLTKTextSplitter`` described by the component inputs.

        Returns:
            An ``NLTKTextSplitter`` configured with the chosen language,
            separator, chunk size and chunk overlap.
        """
        # An empty separator falls back to a blank line; otherwise the typed
        # value goes through ``unescape_string`` (presumably so a literal
        # "\n" typed in the UI becomes a real newline -- confirm in the helper).
        separator = unescape_string(self.separator) if self.separator else "\n\n"

        # The language comes from a free-text field, so trim surrounding
        # whitespace before lowercasing. Without this, "English " or a
        # whitespace-only value would be forwarded verbatim as the language
        # name instead of resolving to a usable one.
        language = self.language.strip().lower() if self.language else ""

        return NLTKTextSplitter(
            language=language or "english",
            separator=separator,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )