| """Retrieve documentation for a given query.""" | |
| from pathlib import Path | |
| from typing import Any | |
| from rich.console import Console | |
| from tqdm import tqdm | |
| import numpy as np | |
| from manifest import Manifest | |
| from langchain.text_splitter import MarkdownHeaderTextSplitter | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| console = Console(soft_wrap=True) | |
try:
    EMBEDDING_MODEL = Manifest(
        client_name="openaiembedding",
    )
except Exception as e:
    console.print(e)
    console.print(
        "Failed to load the embedding model. The OpenAI API key is likely "
        "not set; please set it to run document retrieval.",
        style="bold red",
    )


def load_documentation(path: Path) -> dict[str, str]:
    """Load all markdown documentation files under ``path``."""
    content = {}
    for file in path.glob("**/*.md"):
        with open(file, "r") as f:
            data = f.read()
        # Key each document by its path relative to the docs root.
        key = str(file).replace(str(path), "")
        content[key] = data
    return content


def split_documents(content: dict[str, str]) -> list[dict[str, Any]]:
    """Split documents into chunks.

    Documents are first split on markdown headers so each chunk keeps its
    section context, then further split into ~500-character chunks with a
    50-character overlap.
    """
    md_split_docs = []
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    )
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"], chunk_size=500, chunk_overlap=50, length_function=len
    )
    for file, raw_doc in content.items():
        split_text = markdown_splitter.split_text(raw_doc)
        # Track which file each chunk came from.
        for t in split_text:
            t.metadata["source"] = file
        md_split_docs.extend(split_text)
    docs = text_splitter.split_documents(md_split_docs)
    # Serialize the langchain Documents to plain dicts
    # ({"page_content": ..., "metadata": ...}).
    docs_as_dict = [doc.dict() for doc in docs]
    return docs_as_dict


def get_embeddings(text: str) -> np.ndarray:
    """Embed ``text`` with the OpenAI embedding model."""
    return np.array(EMBEDDING_MODEL.run(text))


def embed_documents(
    chunked_docs: list[dict[str, Any]], key: str = "page_content"
) -> tuple[list[dict[str, Any]], np.ndarray]:
    """Embed each chunk and stack the embeddings into a single matrix."""
    all_embeddings = []
    for doc in tqdm(chunked_docs):
        emb = get_embeddings(doc[key])
        doc["embedding"] = emb
        all_embeddings.append(doc["embedding"])
    # One row per chunk, aligned with the order of ``chunked_docs``.
    full_embedding_mat = np.vstack(all_embeddings)
    return chunked_docs, full_embedding_mat


def query_docs(
    query: str,
    docs: list[dict[str, Any]],
    embedding_mat: np.ndarray,
    top_n: int = 10,
    key: str = "page_content",
) -> tuple[list[int], list[str]]:
    """Return the indices and text of the ``top_n`` chunks most similar to ``query``."""
    query_embedding = get_embeddings(query)
    # OpenAI embeddings are unit-normalized, so the dot product is the
    # cosine similarity.
    scores = embedding_mat.dot(query_embedding)
    sorted_indices = np.argsort(scores)[::-1]
    top_n_indices = sorted_indices[:top_n]
    # Reverse so the most relevant chunk comes last (handy when printing
    # results to a terminal, where the last line sits nearest the prompt).
    top_n_indices_rev = top_n_indices[::-1]
    returned_docs = []
    for i in top_n_indices_rev:
        returned_docs.append(docs[i][key])
    return top_n_indices_rev.tolist(), returned_docs
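

# A minimal usage sketch of the full pipeline, assuming documentation lives
# under a local "docs/" directory (hypothetical path) and that the OpenAI
# API key is set so EMBEDDING_MODEL loaded successfully above.
if __name__ == "__main__":
    content = load_documentation(Path("docs"))
    chunks = split_documents(content)
    chunks, embedding_mat = embed_documents(chunks)
    indices, texts = query_docs(
        "How do I install the package?", chunks, embedding_mat, top_n=3
    )
    for i, text in zip(indices, texts):
        console.print(f"--- {chunks[i]['metadata'].get('source', '?')} ---")
        console.print(text)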