Commit 49d583d
Parent(s): 56d3953
add: SemanticChunker

Files changed:
- medrag_multi_modal/semantic_chunker.py (+52 -0)
- pyproject.toml (+4 -0)
medrag_multi_modal/semantic_chunker.py (ADDED)
@@ -0,0 +1,52 @@
+from typing import Callable, Optional, Union
+
+import semchunk
+import tiktoken
+import tokenizers
+import weave
+from rich.progress import track
+from transformers import PreTrainedTokenizer
+
+TOKENIZER_OR_TOKEN_COUNTER = Union[
+    str,
+    tiktoken.Encoding,
+    PreTrainedTokenizer,
+    tokenizers.Tokenizer,
+    Callable[[str], int],
+]
+
+
+class SemanticChunker:
+    def __init__(
+        self,
+        tokenizer_or_token_counter: TOKENIZER_OR_TOKEN_COUNTER = "o200k_base",
+        chunk_size: Optional[int] = None,
+        max_token_chars: Optional[int] = None,
+        memoize: bool = True,
+    ) -> None:
+        self.chunker = semchunk.chunkerify(
+            tokenizer_or_token_counter,
+            chunk_size=chunk_size,
+            max_token_chars=max_token_chars,
+            memoize=memoize,
+        )
+
+    def chunk_and_publish(
+        self, document_dataset_name: str, chunk_dataset_name: Optional[str] = None
+    ) -> None:
+        document_dataset = weave.ref(document_dataset_name).get().rows
+        chunks = []
+        for idx, document in track(
+            enumerate(document_dataset), description="Chunking documents"
+        ):
+            document_chunks = self.chunker.chunk(str(document["text"]))
+            for chunk in document_chunks:
+                chunks.append(
+                    {
+                        "document_idx": idx,
+                        "document_name": document["document_name"],
+                        "page_idx": document["page_idx"],
+                        "text": chunk,
+                    }
+                )
+        weave.publish(weave.Dataset(name=chunk_dataset_name, rows=chunks))
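A minimal usage sketch of the new class, assuming a weave project has been initialized and a document dataset with "text", "document_name", and "page_idx" columns has already been published; the project and dataset names below are illustrative, not taken from this commit:

import weave

from medrag_multi_modal.semantic_chunker import SemanticChunker

# Hypothetical project and dataset names, used only for illustration.
weave.init(project_name="medrag-multi-modal")

chunker = SemanticChunker(chunk_size=256)

# Reads the published document dataset, splits each page's text into
# token-bounded chunks, and publishes the chunks as a new weave dataset.
chunker.chunk_and_publish(
    document_dataset_name="grays-anatomy-text",
    chunk_dataset_name="grays-anatomy-chunks",
)

The constructor also accepts a transformers or tokenizers tokenizer object, a tiktoken encoding, or any callable mapping a string to a token count, per the TOKENIZER_OR_TOKEN_COUNTER union above.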
pyproject.toml (CHANGED)
@@ -29,6 +29,8 @@ dependencies = [
     "mkdocs-jupyter>=0.25.0",
     "jupyter>=1.1.1",
     "pdfplumber>=0.11.4",
+    "semchunk>=2.2.0",
+    "tiktoken>=0.8.0",
 ]

 [project.optional-dependencies]
@@ -41,6 +43,8 @@ core = [
     "PyPDF2>=3.0.1",
     "python-dotenv>=1.0.1",
     "pymupdf4llm>=0.0.17",
+    "semchunk>=2.2.0",
+    "tiktoken>=0.8.0",
     "torch>=2.4.1",
     "weave>=0.51.14",
 ]
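The two new dependencies line up with the imports in the added module: tiktoken supplies the default "o200k_base" encoding and semchunk performs the actual splitting. A small sketch of what that default configuration resolves to underneath; the sample text and chunk size here are arbitrary:

import semchunk
import tiktoken

# "o200k_base" is the default tokenizer_or_token_counter in SemanticChunker.
# Resolving the encoding explicitly and handing it to semchunk.chunkerify
# roughly mirrors what the class constructor sets up.
encoding = tiktoken.get_encoding("o200k_base")
chunker = semchunk.chunkerify(encoding, chunk_size=128)

sample = "The heart is a muscular organ with four chambers: two atria and two ventricles."
print(chunker(sample))  # a list of chunk strings, each within the 128-token budget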