# Configuration constants: data locations, LanceDB table layout, chunking and
# embedding selection, retrieval sizes, and per-model lookup tables.

# Directory of Markdown sources.
# NOTE(review): presumably the documents that get chunked and indexed - confirm.
MARKDOWN_SOURCE_DIR = "data/transformers/docs/source/en/"

# LanceDB storage directory and the name of the table inside it.
LANCEDB_DIRECTORY = "data/lancedb"
LANCEDB_TABLE_NAME = "table"

# Column names of the table.
VECTOR_COLUMN_NAME = "embedding"
TEXT_COLUMN_NAME = "text"
DOCUMENT_PATH_COLUMN_NAME = "document_path"

# Active chunking policy; the commented-out line is the alternative setting.
CHUNK_POLICY = "md"
# CHUNK_POLICY = "txt"

# Active embedding model identifier; the commented-out line is the alternative.
# The identifier matches the keys used in the lookup tables below.
EMBED_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# EMBED_NAME = "text-embedding-ada-002"

# NOTE(review): presumably the number of candidates kept after the initial
# ranking and after re-ranking, respectively - confirm against the callers.
TOP_K_RANK = 50
TOP_K_RERANK = 5

# Embedding size per embedding model identifier.
emb_sizes = {
    "sentence-transformers/all-MiniLM-L6-v2": 384,
    "thenlper/gte-large": 1024,
    "text-embedding-ada-002": 1536,
}

# Distance threshold per embedding model identifier.
# NOTE(review): there is no entry for "thenlper/gte-large", although it appears
# in emb_sizes and context_lengths. If this table is indexed by EMBED_NAME,
# selecting that model would raise KeyError - confirm and add a value.
thresh_distances = {
    "sentence-transformers/all-MiniLM-L6-v2": 1.2,
    "text-embedding-ada-002": 0.5,
}

# Context length per model identifier; holds both generation models and
# embedding models in one table.
context_lengths = {
    "mistralai/Mistral-7B-Instruct-v0.1": 4096,
    "GeneZC/MiniChat-3B": 4096,
    "gpt-3.5-turbo": 4096,
    "sentence-transformers/all-MiniLM-L6-v2": 128,
    "thenlper/gte-large": 512,
    "text-embedding-ada-002": 1000,  # actual context length is 8191, but it's too much
}