Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +138 -62
- data/Fab2Esp_transparent.png +0 -0
- parameters.py +3 -2
- src/fabrics_processor/config.py +10 -9
- src/fabrics_processor/database.py +11 -9
- src/fabrics_processor/database_updater.py +32 -27
- src/search_qdrant/streamlit_app.py +9 -9
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/Fab2Esp_transparent.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -6,98 +6,174 @@ sdk_version: 5.12.0
|
|
| 6 |
---
|
| 7 |
# Fabric to Espanso Converter
|
| 8 |
|
| 9 |
-
A Python application that bridges Fabric prompts with Espanso by managing and converting prompts through a vector database.
|
|
|
|
| 10 |
|
| 11 |
## Features
|
| 12 |
|
| 13 |
-
- Store and manage Fabric prompts in a Qdrant vector database
|
| 14 |
-
- Convert stored prompts into Espanso YAML format for system-wide usage
|
| 15 |
-
-
|
| 16 |
-
- Web interface for easy
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
## Prerequisites
|
| 19 |
|
| 20 |
-
- Python 3.11
|
| 21 |
-
-
|
| 22 |
-
-
|
| 23 |
-
-
|
|
|
|
| 24 |
|
| 25 |
## Installation
|
| 26 |
|
| 27 |
-
1.
|
| 28 |
-
2. In Obsidian, create the following folder structure:
|
| 29 |
-
```
|
| 30 |
-
Extra/
|
| 31 |
-
└── FabricPatterns/
|
| 32 |
-
├── Official/ # For downloaded Fabric patterns
|
| 33 |
-
└── Own/ # For your custom additions
|
| 34 |
-
```
|
| 35 |
-
3. Clone this repository
|
| 36 |
-
4. Install dependencies using PDM:
|
| 37 |
```bash
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
pdm install
|
| 39 |
```
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
## Usage
|
| 43 |
|
| 44 |
-
###
|
| 45 |
|
| 46 |
-
|
| 47 |
```bash
|
| 48 |
-
|
|
|
|
| 49 |
```
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
Create a PowerShell script with the following content to start the application:
|
| 54 |
-
|
| 55 |
```powershell
|
| 56 |
-
#
|
| 57 |
-
|
| 58 |
-
$startInfo.Filename = "wsl.exe"
|
| 59 |
-
# Use -c flag to let the command use the WSL2 Ubuntu folder system and not the Windows
|
| 60 |
-
$startInfo.Arguments = "bash -c ~/Tools/pythagora-core/workspace/fabrics_processor/src/search_qdrant/run_streamlit.sh"
|
| 61 |
-
$startInfo.UseShellExecute = $false
|
| 62 |
-
$startInfo.RedirectStandardOutput = $true
|
| 63 |
-
$startInfo.RedirectStandardError = $true
|
| 64 |
-
$startInfo.WindowStyle = [System.Diagnostics.ProcessWindowStyle]::Hidden
|
| 65 |
-
$startInfo.CreateNoWindow = $true
|
| 66 |
-
|
| 67 |
-
# Start the process
|
| 68 |
-
try {
|
| 69 |
-
$process = [System.Diagnostics.Process]::Start($startInfo)
|
| 70 |
-
Start-Sleep -Seconds 5
|
| 71 |
-
|
| 72 |
-
# Check if Streamlit is actually running
|
| 73 |
-
$streamlitRunning = Test-NetConnection -ComputerName localhost -Port 8501 -WarningAction SilentlyContinue
|
| 74 |
-
|
| 75 |
-
if ($streamlitRunning.TcpTestSucceeded) {
|
| 76 |
-
Start-Process "msedge.exe" "--app=http://localhost:8501"
|
| 77 |
-
} else {
|
| 78 |
-
Write-Error "Failed to start Streamlit application"
|
| 79 |
-
}
|
| 80 |
-
} catch {
|
| 81 |
-
Write-Error "Error starting Streamlit: $_"
|
| 82 |
-
}
|
| 83 |
```
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
## Dependencies
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
-
|
| 95 |
- qdrant-client >= 1.12.1
|
| 96 |
- fastembed >= 0.4.2
|
| 97 |
-
-
|
| 98 |
- pyperclip >= 1.9.0
|
|
|
|
| 99 |
- regex >= 2024.11.6
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
## License
|
| 102 |
|
| 103 |
This project is licensed under the MIT License.
|
|
|
|
| 6 |
---
|
| 7 |
# Fabric to Espanso Converter
|
| 8 |
|
| 9 |
+
A Python application that bridges Fabric prompts with Espanso and the Obsidian TextGenerator plugin by managing and converting prompts through a vector database. It enables semantic search and efficient management of prompts while providing a modern web interface for easy interaction.
|
| 10 |
+
There's also a separate Gradio app that can be hosted on Hugging Face Spaces to provide a query-only interface.
|
| 11 |
|
| 12 |
## Features
|
| 13 |
|
| 14 |
+
- **Vector Database Integration**: Store and manage Fabric prompts in a Qdrant vector database with semantic search capabilities
|
| 15 |
+
- **Automated Conversion**: Convert stored prompts into Espanso YAML format for system-wide usage
|
| 16 |
+
- **Change Detection**: Automatically detect and process changes in the Fabric patterns folder
|
| 17 |
+
- **Web Interface**: Modern Gradio-based interface for easy prompt searching and management
|
| 18 |
+
- **Semantic Search**: Find relevant prompts based on their meaning, not just exact matches
|
| 19 |
+
- **Clipboard Integration**: Quick copying of prompts directly to clipboard
|
| 20 |
+
- **Logging System**: Comprehensive logging for tracking operations and debugging
|
| 21 |
|
| 22 |
## Prerequisites
|
| 23 |
|
| 24 |
+
- Python 3.11 or higher
|
| 25 |
+
- Fabric (https://github.com/danielmiessler/fabric)
|
| 26 |
+
- Qdrant vector database (local or cloud instance)
|
| 27 |
+
- Obsidian with TextGenerator plugin (https://github.com/obsidianmd/obsidian-textgenerator)
|
| 28 |
+
- Linux/WSL2 or Windows with WSL2
|
| 29 |
|
| 30 |
## Installation
|
| 31 |
|
| 32 |
+
1. **Environment Setup**:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
```bash
|
| 34 |
+
# Clone the repository
|
| 35 |
+
git clone [repository-url]
|
| 36 |
+
cd fabric_to_espanso
|
| 37 |
+
|
| 38 |
+
# Install PDM if not already installed
|
| 39 |
+
pip install pdm
|
| 40 |
+
|
| 41 |
+
# Install dependencies
|
| 42 |
pdm install
|
| 43 |
```
|
| 44 |
+
|
| 45 |
+
2. **Configuration**:
|
| 46 |
+
- Copy `.env.example` to `.env`
|
| 47 |
+
- Set your Qdrant API key in `.env`:
|
| 48 |
+
```
|
| 49 |
+
QDRANT_API_KEY=your_api_key_here
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
3. **Obsidian Setup**:
|
| 53 |
+
- Install Obsidian and the TextGenerator plugin
|
| 54 |
+
- Create the folder structure:
|
| 55 |
+
```
|
| 56 |
+
Extra/
|
| 57 |
+
└── FabricPatterns/
|
| 58 |
+
├── Official/ # Official Fabric patterns
|
| 59 |
+
└── Own/ # Custom patterns
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
4. **Fabric Setup**:
|
| 63 |
+
- Install Fabric, see https://github.com/danielmiessler/fabric
|
| 64 |
+
|
| 65 |
+
5. **Qdrant Setup**:
|
| 66 |
+
- Install Qdrant, see https://qdrant.io/en/
|
| 67 |
+
- Start Qdrant server
|
| 68 |
+
|
| 69 |
+
6. **Parameters**:
|
| 70 |
+
- Set all the parameters in the file `parameters.py`.
|
| 71 |
+
|
| 72 |
+
7. **Optional**:
|
| 73 |
+
- Create a PowerShell script to run the Streamlit app
|
| 74 |
+
|
| 75 |
|
| 76 |
## Usage
|
| 77 |
|
| 78 |
+
### Starting the Application
|
| 79 |
|
| 80 |
+
#### Linux/WSL2
|
| 81 |
```bash
|
| 82 |
+
# Start the Gradio interface
|
| 83 |
+
python gradio_app_query_only.py
|
| 84 |
```
|
| 85 |
|
| 86 |
+
#### Windows (with WSL2)
|
|
|
|
|
|
|
|
|
|
| 87 |
```powershell
|
| 88 |
+
# Use the provided PowerShell script
|
| 89 |
+
./start_app.ps1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
```
|
| 91 |
|
| 92 |
+
### Core Operations
|
| 93 |
+
|
| 94 |
+
1. **Search Prompts**:
|
| 95 |
+
- Enter your search query in the search box
|
| 96 |
+
- Results are ranked by semantic similarity
|
| 97 |
+
- Click on a result to view its contents
|
| 98 |
+
|
| 99 |
+
2. **Copy Prompts**:
|
| 100 |
+
- Select a prompt from the results
|
| 101 |
+
- Click "Copy to Clipboard" to copy the prompt text
|
| 102 |
+
|
| 103 |
+
3. **Update Database**:
|
| 104 |
+
- Run `python main.py` to process changes in the Fabric patterns folder
|
| 105 |
+
- New and modified prompts are automatically added to the database
|
| 106 |
+
- Deleted prompts are removed from the database
|
| 107 |
+
|
| 108 |
+
## Project Structure
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
fabric_to_espanso/
|
| 112 |
+
├── src/
|
| 113 |
+
│ ├── fabrics_processor/ # Core processing logic
|
| 114 |
+
│ └── search_qdrant/ # Search functionality
|
| 115 |
+
├── gradio_app_query_only.py # Web interface
|
| 116 |
+
├── main.py # CLI entry point
|
| 117 |
+
└── parameters.py # Configuration parameters
|
| 118 |
+
```
|
| 119 |
|
| 120 |
## Dependencies
|
| 121 |
|
| 122 |
+
Core dependencies are managed through PDM:
|
| 123 |
+
|
| 124 |
+
- gradio >= 5.12.0
|
| 125 |
- qdrant-client >= 1.12.1
|
| 126 |
- fastembed >= 0.4.2
|
| 127 |
+
- python-dotenv
|
| 128 |
- pyperclip >= 1.9.0
|
| 129 |
+
- pyyaml >= 6.0.2
|
| 130 |
- regex >= 2024.11.6
|
| 131 |
|
| 132 |
+
## TODO
|
| 133 |
+
|
| 134 |
+
The following items need to be addressed to improve code quality, maintainability, and functionality:
|
| 135 |
+
|
| 136 |
+
### Database Optimization
|
| 137 |
+
- Check the database for any points with exactly the same vector or nearly the same. Remove those to reduce redundancy and improve search efficiency.
|
| 138 |
+
|
| 139 |
+
### Metadata Enhancement
|
| 140 |
+
- If available, use the readme.md file from the fabrics folder to fill the "purpose" field in the database entries.
|
| 141 |
+
- If readme.md is not available in the fabrics folder, create the "purpose" field from an LLM response that summarizes the goal of the fabric file.
|
| 142 |
+
|
| 143 |
+
### UI/UX Improvements
|
| 144 |
+
- Add a compare interface to the gradio app to allow side-by-side comparison of prompts.
|
| 145 |
+
- Remove the streamlit_only_query app as it's being replaced by the gradio interface.
|
| 146 |
+
|
| 147 |
+
### Code Refactoring
|
| 148 |
+
- Implement proper error handling for database operations.
|
| 149 |
+
- Add comprehensive logging throughout the application.
|
| 150 |
+
- Create unit tests for core functionality.
|
| 151 |
+
- Implement type hints consistently across all Python files.
|
| 152 |
+
- Add input validation for all user-provided data.
|
| 153 |
+
- Refactor the database operations into a dedicated class.
|
| 154 |
+
- Implement connection pooling for better database performance.
|
| 155 |
+
- Add docstrings to all functions and classes.
|
| 156 |
+
- Create a configuration class to handle all settings.
|
| 157 |
+
- Add proper cleanup of resources in error cases.
|
| 158 |
+
|
| 159 |
+
### Documentation
|
| 160 |
+
- Add API documentation for all public interfaces.
|
| 161 |
+
- Include examples for common use cases.
|
| 162 |
+
- Document the database schema and vector space organization.
|
| 163 |
+
- Add contribution guidelines.
|
| 164 |
+
- Include a troubleshooting section.
|
| 165 |
+
|
| 166 |
+
### Security
|
| 167 |
+
- Implement proper environment variable handling.
|
| 168 |
+
- Add input sanitization for all user inputs.
|
| 169 |
+
- Implement rate limiting for the web interface.
|
| 170 |
+
- Add proper authentication for the web interface.
|
| 171 |
+
|
| 172 |
+
### Performance
|
| 173 |
+
- Implement caching for frequently accessed prompts.
|
| 174 |
+
- Optimize vector similarity search parameters.
|
| 175 |
+
- Add batch processing for large-scale operations.
|
| 176 |
+
|
| 177 |
## License
|
| 178 |
|
| 179 |
This project is licensed under the MIT License.
|
data/Fab2Esp_transparent.png
CHANGED
|
|
Git LFS Details
|
parameters.py
CHANGED
|
@@ -47,7 +47,7 @@ BASE_WORDS = ['Identity', 'Purpose', 'Task', 'Goal']
|
|
| 47 |
# COLLECTION_NAME = "fabric_patterns"
|
| 48 |
# Cloud:
|
| 49 |
QDRANT_URL = "https://91ed3a93-6135-4951-a624-1c8c2878240d.europe-west3-0.gcp.cloud.qdrant.io:6333"
|
| 50 |
-
COLLECTION_NAME = "
|
| 51 |
|
| 52 |
# Required fields for database points
|
| 53 |
# TODO: default trigger wordt nu twee keer gedefinieerd, oplossen
|
|
@@ -61,4 +61,5 @@ REQUIRED_FIELDS_DEFAULTS = {
|
|
| 61 |
|
| 62 |
# Embedding Model parameters voor Qdrant
|
| 63 |
USE_FASTEMBED = True
|
| 64 |
-
|
|
|
|
|
|
| 47 |
# COLLECTION_NAME = "fabric_patterns"
|
| 48 |
# Cloud:
|
| 49 |
QDRANT_URL = "https://91ed3a93-6135-4951-a624-1c8c2878240d.europe-west3-0.gcp.cloud.qdrant.io:6333"
|
| 50 |
+
COLLECTION_NAME = "fabric_patterns_hybrid"
|
| 51 |
|
| 52 |
# Required fields for database points
|
| 53 |
# TODO: default trigger wordt nu twee keer gedefinieerd, oplossen
|
|
|
|
| 61 |
|
| 62 |
# Embedding Model parameters voor Qdrant
|
| 63 |
USE_FASTEMBED = True
|
| 64 |
+
EMBED_MODEL_DENSE = 'BAAI/bge-base-en' # "fast-bge-small-en"
|
| 65 |
+
EMBED_MODEL_SPARSE = "prithivida/Splade_PP_en_v1"
|
src/fabrics_processor/config.py
CHANGED
|
@@ -16,7 +16,8 @@ from parameters import (
|
|
| 16 |
BASE_WORDS,
|
| 17 |
QDRANT_URL,
|
| 18 |
USE_FASTEMBED,
|
| 19 |
-
|
|
|
|
| 20 |
COLLECTION_NAME,
|
| 21 |
REQUIRED_FIELDS,
|
| 22 |
REQUIRED_FIELDS_DEFAULTS
|
|
@@ -61,22 +62,22 @@ class DatabaseConfig:
|
|
| 61 |
raise ConfigurationError(str(e))
|
| 62 |
|
| 63 |
@dataclass
|
| 64 |
-
class
|
| 65 |
"""Embedding model configuration."""
|
| 66 |
use_fastembed: bool = USE_FASTEMBED
|
| 67 |
-
model_name: str = EMBED_MODEL
|
| 68 |
collection_name: str = COLLECTION_NAME
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
def validate(self) -> None:
|
| 72 |
"""Validate the embedding configuration."""
|
| 73 |
-
if not self.
|
| 74 |
from .exceptions import ConfigurationError
|
| 75 |
-
raise ConfigurationError("Embedding model name cannot be empty")
|
| 76 |
|
| 77 |
-
if self.
|
| 78 |
from .exceptions import ConfigurationError
|
| 79 |
-
raise ConfigurationError(
|
| 80 |
|
| 81 |
class Config:
|
| 82 |
"""Global configuration singleton."""
|
|
@@ -86,7 +87,7 @@ class Config:
|
|
| 86 |
if cls._instance is None:
|
| 87 |
cls._instance = super().__new__(cls)
|
| 88 |
cls._instance.database = DatabaseConfig()
|
| 89 |
-
cls._instance.embedding =
|
| 90 |
cls._instance.espanso_trigger = DEFAULT_TRIGGER
|
| 91 |
cls._instance.fabric_patterns_folder = FABRIC_PATTERNS_FOLDER
|
| 92 |
cls._instance.yaml_output_folder = YAML_OUTPUT_FOLDER
|
|
|
|
| 16 |
BASE_WORDS,
|
| 17 |
QDRANT_URL,
|
| 18 |
USE_FASTEMBED,
|
| 19 |
+
EMBED_MODEL_DENSE,
|
| 20 |
+
EMBED_MODEL_SPARSE,
|
| 21 |
COLLECTION_NAME,
|
| 22 |
REQUIRED_FIELDS,
|
| 23 |
REQUIRED_FIELDS_DEFAULTS
|
|
|
|
| 62 |
raise ConfigurationError(str(e))
|
| 63 |
|
| 64 |
@dataclass
|
| 65 |
+
class EmbeddingModelConfig:
|
| 66 |
"""Embedding model configuration."""
|
| 67 |
use_fastembed: bool = USE_FASTEMBED
|
|
|
|
| 68 |
collection_name: str = COLLECTION_NAME
|
| 69 |
+
dense_model_name: str = EMBED_MODEL_DENSE
|
| 70 |
+
sparse_model_name: str = EMBED_MODEL_SPARSE
|
| 71 |
|
| 72 |
def validate(self) -> None:
|
| 73 |
"""Validate the embedding configuration."""
|
| 74 |
+
if not self.dense_model_name:
|
| 75 |
from .exceptions import ConfigurationError
|
| 76 |
+
raise ConfigurationError("Dense Embedding model name cannot be empty")
|
| 77 |
|
| 78 |
+
if not self.sparse_model_name:
|
| 79 |
from .exceptions import ConfigurationError
|
| 80 |
+
raise ConfigurationError("Sparse Embedding model name cannot be empty")
|
| 81 |
|
| 82 |
class Config:
|
| 83 |
"""Global configuration singleton."""
|
|
|
|
| 87 |
if cls._instance is None:
|
| 88 |
cls._instance = super().__new__(cls)
|
| 89 |
cls._instance.database = DatabaseConfig()
|
| 90 |
+
cls._instance.embedding = EmbeddingModelConfig()
|
| 91 |
cls._instance.espanso_trigger = DEFAULT_TRIGGER
|
| 92 |
cls._instance.fabric_patterns_folder = FABRIC_PATTERNS_FOLDER
|
| 93 |
cls._instance.yaml_output_folder = YAML_OUTPUT_FOLDER
|
src/fabrics_processor/database.py
CHANGED
|
@@ -52,7 +52,8 @@ def initialize_qdrant_database(
|
|
| 52 |
api_key: Optional[str] = "",
|
| 53 |
collection_name: str = config.embedding.collection_name,
|
| 54 |
use_fastembed: bool = config.embedding.use_fastembed,
|
| 55 |
-
|
|
|
|
| 56 |
) -> QdrantClient:
|
| 57 |
"""Initialize the Qdrant database for storing markdown file information.
|
| 58 |
|
|
@@ -75,6 +76,9 @@ def initialize_qdrant_database(
|
|
| 75 |
|
| 76 |
# Create database connection
|
| 77 |
client = create_database_connection(url=url, api_key=api_key)
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# Check if collection exists
|
| 80 |
collections = client.get_collections()
|
|
@@ -85,19 +89,17 @@ def initialize_qdrant_database(
|
|
| 85 |
|
| 86 |
# Create collection with appropriate vector configuration
|
| 87 |
if use_fastembed:
|
| 88 |
-
|
|
|
|
| 89 |
else:
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
size=config.embedding.vector_size,
|
| 93 |
-
distance=Distance.COSINE
|
| 94 |
-
)
|
| 95 |
-
}
|
| 96 |
|
| 97 |
try:
|
| 98 |
client.create_collection(
|
| 99 |
collection_name=collection_name,
|
| 100 |
-
vectors_config=
|
|
|
|
| 101 |
on_disk_payload=True
|
| 102 |
)
|
| 103 |
except exceptions.UnexpectedResponse as e:
|
|
|
|
| 52 |
api_key: Optional[str] = "",
|
| 53 |
collection_name: str = config.embedding.collection_name,
|
| 54 |
use_fastembed: bool = config.embedding.use_fastembed,
|
| 55 |
+
dense_model: str = config.embedding.dense_model_name,
|
| 56 |
+
sparse_model: str = config.embedding.sparse_model_name
|
| 57 |
) -> QdrantClient:
|
| 58 |
"""Initialize the Qdrant database for storing markdown file information.
|
| 59 |
|
|
|
|
| 76 |
|
| 77 |
# Create database connection
|
| 78 |
client = create_database_connection(url=url, api_key=api_key)
|
| 79 |
+
|
| 80 |
+
client.set_model(dense_model)
|
| 81 |
+
client.set_sparse_model(sparse_model)
|
| 82 |
|
| 83 |
# Check if collection exists
|
| 84 |
collections = client.get_collections()
|
|
|
|
| 89 |
|
| 90 |
# Create collection with appropriate vector configuration
|
| 91 |
if use_fastembed:
|
| 92 |
+
vectors_config = client.get_fastembed_vector_params()
|
| 93 |
+
sparse_vectors_config = client.get_fastembed_sparse_vector_params()
|
| 94 |
else:
|
| 95 |
+
print("Creating database without Fastembed not implemented yet.")
|
| 96 |
+
raise NotImplementedError()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
try:
|
| 99 |
client.create_collection(
|
| 100 |
collection_name=collection_name,
|
| 101 |
+
vectors_config=vectors_config,
|
| 102 |
+
sparse_vectors_config=sparse_vectors_config,
|
| 103 |
on_disk_payload=True
|
| 104 |
)
|
| 105 |
except exceptions.UnexpectedResponse as e:
|
src/fabrics_processor/database_updater.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from typing import Optional
|
| 2 |
from qdrant_client import QdrantClient
|
| 3 |
from qdrant_client.http.models import PointStruct, Filter, FieldCondition, MatchValue, PointIdsList
|
| 4 |
-
from fastembed import TextEmbedding
|
| 5 |
import logging
|
| 6 |
import uuid
|
| 7 |
from .output_files_generator import generate_yaml_file, generate_markdown_files
|
|
@@ -11,7 +11,7 @@ from .database import validate_point_payload
|
|
| 11 |
|
| 12 |
logger = logging.getLogger('fabric_to_espanso')
|
| 13 |
|
| 14 |
-
def get_embedding(text: str
|
| 15 |
"""
|
| 16 |
Generate embedding vector for the given text using FastEmbed.
|
| 17 |
|
|
@@ -19,10 +19,25 @@ def get_embedding(text: str, embedding_model: TextEmbedding) -> list:
|
|
| 19 |
text (str): Text to generate embedding for
|
| 20 |
|
| 21 |
Returns:
|
| 22 |
-
list:
|
| 23 |
"""
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def update_qdrant_database(client: QdrantClient, collection_name: str, new_files: list, modified_files: list, deleted_files: list):
|
| 28 |
"""
|
|
@@ -34,16 +49,10 @@ def update_qdrant_database(client: QdrantClient, collection_name: str, new_files
|
|
| 34 |
modified_files (list): List of modified files to be updated in the database.
|
| 35 |
deleted_files (list): List of deleted files to be removed from the database.
|
| 36 |
"""
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
logger.info(f"Initializing FastEmbed model.")
|
| 42 |
-
embedding_model = TextEmbedding()
|
| 43 |
-
else:
|
| 44 |
-
logger.info(f"Initializing embbeding model: {config.model_name}")
|
| 45 |
-
# TODO: testen. Weet niet of dit werkt.
|
| 46 |
-
embedding_model = TextEmbedding(model_name=config.model_name)
|
| 47 |
|
| 48 |
try:
|
| 49 |
# Add new files
|
|
@@ -52,9 +61,10 @@ def update_qdrant_database(client: QdrantClient, collection_name: str, new_files
|
|
| 52 |
payload_new = validate_point_payload(file)
|
| 53 |
point = PointStruct(
|
| 54 |
id=str(uuid.uuid4()), # Generate a new UUID for each point
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
payload={
|
| 59 |
"filename": payload_new['filename'],
|
| 60 |
"content": payload_new['content'],
|
|
@@ -87,15 +97,10 @@ def update_qdrant_database(client: QdrantClient, collection_name: str, new_files
|
|
| 87 |
# Update the existing point with the new file data
|
| 88 |
point = PointStruct(
|
| 89 |
id=point_id,
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
# Zie https://github.com/qdrant/qdrant-client/discussions/598
|
| 95 |
-
# De naam die fastembed gebruikt is afhankelijk van het model dat je gebruikt.
|
| 96 |
-
# Je kunt de naam vinden door: client.get_vector_field_name()
|
| 97 |
-
vector={'fast-bge-small-en':
|
| 98 |
-
get_embedding(file['purpose'], embedding_model)}, # Generate vector from purpose field
|
| 99 |
payload={
|
| 100 |
"filename": payload_current['filename'],
|
| 101 |
"content": file['content'],
|
|
|
|
| 1 |
from typing import Optional
|
| 2 |
from qdrant_client import QdrantClient
|
| 3 |
from qdrant_client.http.models import PointStruct, Filter, FieldCondition, MatchValue, PointIdsList
|
| 4 |
+
from fastembed import TextEmbedding, SparseTextEmbedding
|
| 5 |
import logging
|
| 6 |
import uuid
|
| 7 |
from .output_files_generator import generate_yaml_file, generate_markdown_files
|
|
|
|
| 11 |
|
| 12 |
logger = logging.getLogger('fabric_to_espanso')
|
| 13 |
|
| 14 |
+
def get_embedding(text: str) -> list:
|
| 15 |
"""
|
| 16 |
Generate embedding vector for the given text using FastEmbed.
|
| 17 |
|
|
|
|
| 19 |
text (str): Text to generate embedding for
|
| 20 |
|
| 21 |
Returns:
|
| 22 |
+
list: Tuple of (dense_embeddings, sparse_embeddings)
|
| 23 |
"""
|
| 24 |
+
if not config.embedding.use_fastembed:
|
| 25 |
+
msg = "Embedding model not initialized. Set use_fastembed to True in the configuration."
|
| 26 |
+
logger.error(msg)
|
| 27 |
+
raise ConfigurationError(msg)
|
| 28 |
+
|
| 29 |
+
# Models are lazily initialized only when needed
|
| 30 |
+
if not hasattr(get_embedding, '_dense_model'):
|
| 31 |
+
get_embedding._dense_model = TextEmbedding(model_name=config.embedding.dense_model_name)
|
| 32 |
+
if not hasattr(get_embedding, '_sparse_model'):
|
| 33 |
+
get_embedding._sparse_model = SparseTextEmbedding(model_name=config.embedding.sparse_model_name)
|
| 34 |
+
|
| 35 |
+
dense_embeddings = list(get_embedding._dense_model.embed(text))[0]
|
| 36 |
+
sparse_embedding = list(get_embedding._sparse_model.embed(text, return_dense=False))[0]
|
| 37 |
+
return dense_embeddings, {
|
| 38 |
+
'indices': sparse_embedding.indices.tolist(),
|
| 39 |
+
'values': sparse_embedding.values.tolist()
|
| 40 |
+
}
|
| 41 |
|
| 42 |
def update_qdrant_database(client: QdrantClient, collection_name: str, new_files: list, modified_files: list, deleted_files: list):
|
| 43 |
"""
|
|
|
|
| 49 |
modified_files (list): List of modified files to be updated in the database.
|
| 50 |
deleted_files (list): List of deleted files to be removed from the database.
|
| 51 |
"""
|
| 52 |
+
if not config.embedding.use_fastembed:
|
| 53 |
+
msg = "Embedding model not initialized. Set use_fastembed to True in the configuration."
|
| 54 |
+
logger.info(msg)
|
| 55 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
try:
|
| 58 |
# Add new files
|
|
|
|
| 61 |
payload_new = validate_point_payload(file)
|
| 62 |
point = PointStruct(
|
| 63 |
id=str(uuid.uuid4()), # Generate a new UUID for each point
|
| 64 |
+
vector={
|
| 65 |
+
'fast-bge-base-en': get_embedding(payload_new['purpose'])[0],
|
| 66 |
+
'fast-sparse-splade_pp_en_v1': get_embedding(payload_new['purpose'])[1]
|
| 67 |
+
},
|
| 68 |
payload={
|
| 69 |
"filename": payload_new['filename'],
|
| 70 |
"content": payload_new['content'],
|
|
|
|
| 97 |
# Update the existing point with the new file data
|
| 98 |
point = PointStruct(
|
| 99 |
id=point_id,
|
| 100 |
+
vector={
|
| 101 |
+
'fast-bge-base-en': get_embedding(payload_current['purpose'])[0],
|
| 102 |
+
'fast-sparse-splade_pp_en_v1': get_embedding(payload_current['purpose'])[1]
|
| 103 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
payload={
|
| 105 |
"filename": payload_current['filename'],
|
| 106 |
"content": file['content'],
|
src/search_qdrant/streamlit_app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
| 2 |
import pyperclip
|
| 3 |
from pathlib import Path
|
| 4 |
from src.fabrics_processor.database import initialize_qdrant_database
|
|
@@ -155,15 +156,14 @@ def update_database():
|
|
| 155 |
fabric_patterns_folder=config.fabric_patterns_folder
|
| 156 |
)
|
| 157 |
|
| 158 |
-
# Update the database
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
)
|
| 167 |
|
| 168 |
# Get updated collection info
|
| 169 |
collection_info = st.session_state.client.get_collection(config.embedding.collection_name)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
import pyperclip
|
| 4 |
from pathlib import Path
|
| 5 |
from src.fabrics_processor.database import initialize_qdrant_database
|
|
|
|
| 156 |
fabric_patterns_folder=config.fabric_patterns_folder
|
| 157 |
)
|
| 158 |
|
| 159 |
+
# Update the database
|
| 160 |
+
update_qdrant_database(
|
| 161 |
+
client=st.session_state.client,
|
| 162 |
+
collection_name=config.embedding.collection_name,
|
| 163 |
+
new_files=new_files,
|
| 164 |
+
modified_files=modified_files,
|
| 165 |
+
deleted_files=deleted_files
|
| 166 |
+
)
|
|
|
|
| 167 |
|
| 168 |
# Get updated collection info
|
| 169 |
collection_info = st.session_state.client.get_collection(config.embedding.collection_name)
|