Spaces:
Sleeping
Sleeping
| import json | |
| import dataclasses | |
| from uuid import UUID | |
| from typing import Any | |
| from datetime import datetime, date | |
| import configparser | |
| from torch import cuda | |
| from qdrant_client.http import models as rest | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.cross_encoders import HuggingFaceCrossEncoder | |
def get_config(fp):
    """Load an INI-style configuration file.

    Args:
        fp: Path of the configuration file to read.

    Returns:
        configparser.ConfigParser: Parser populated with the file's contents.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
        configparser.Error: If the file contents cannot be parsed.
    """
    config = configparser.ConfigParser()
    # Open through a context manager so the handle is closed as soon as the
    # file has been parsed instead of being left to the garbage collector.
    with open(fp) as config_file:
        config.read_file(config_file)
    return config
def get_embeddings_model(config):
    """Build the HuggingFace embedding model described by the ``retriever`` section.

    Args:
        config: Object exposing ``get(section, option)``; presumably the
            ``ConfigParser`` produced by ``get_config`` (confirm with callers).
            The ``retriever`` section must provide ``MODEL`` (model name) and
            ``NORMALIZE`` (a value parseable by ``int``; non-zero enables
            normalization).

    Returns:
        HuggingFaceEmbeddings: Embedding wrapper placed on CUDA when
        ``torch.cuda`` reports it as available, otherwise on the CPU.
    """
    # Probe the hardware first, before any configuration lookups.
    use_gpu = cuda.is_available()

    return HuggingFaceEmbeddings(
        show_progress=True,
        model_name=config.get("retriever", "MODEL"),
        model_kwargs={"device": "cuda" if use_gpu else "cpu"},
        encode_kwargs={
            # NORMALIZE is stored as an integer flag, hence int() before bool().
            "normalize_embeddings": bool(int(config.get("retriever", "NORMALIZE"))),
            "batch_size": 100,
        },
    )
| # Create a search filter for Qdrant | |
def create_filter(
    reports: list = None, sources: str = None, subtype: str = None, year: str = None
):
    """Build the Qdrant payload filter used to restrict a search.

    Args:
        reports: File names to restrict the search to. When empty or ``None``
            the filter is built from ``sources`` and ``subtype`` instead.
        sources: Value that ``metadata.source`` must equal. Only used when no
            ``reports`` are given.
        subtype: Passed to ``MatchAny(any=...)`` against ``metadata.filename``.
            Only used when no ``reports`` are given.
            NOTE(review): annotated as ``str`` but forwarded to ``MatchAny``,
            which presumably expects a list of values - confirm with callers.
        year: Accepted for interface compatibility but currently not applied;
            no condition on ``metadata.year`` is added to the filter.

    Returns:
        rest.Filter: A filter whose ``must`` clause holds the conditions above.
    """
    # ``not reports`` covers both an empty list and None, so a caller passing
    # ``reports=None`` no longer crashes on ``len(None)``. The default is None
    # rather than a shared mutable list.
    if not reports:
        print(f"defining filter for sources:{sources}, subtype:{subtype}")
        return rest.Filter(
            must=[
                rest.FieldCondition(
                    key="metadata.source", match=rest.MatchValue(value=sources)
                ),
                rest.FieldCondition(
                    key="metadata.filename", match=rest.MatchAny(any=subtype)
                ),
            ]
        )

    print(f"defining filter for allreports:{reports}")
    return rest.Filter(
        must=[
            rest.FieldCondition(
                key="metadata.filename", match=rest.MatchAny(any=reports)
            )
        ]
    )
def load_json(fp):
    """Read a JSON document from disk.

    Args:
        fp: Path of the JSON file to read.

    Returns:
        The decoded JSON content (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII content does not depend on the
    # platform's default locale encoding.
    with open(fp, "r", encoding="utf-8") as f:
        return json.load(f)
def get_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string.

    Returns:
        str: Fourteen digits, e.g. ``"20240131235959"``.
    """
    # This file imports the ``datetime`` class directly
    # (``from datetime import datetime``), so ``now`` is called on the class;
    # ``datetime.datetime.now()`` would raise AttributeError.
    return datetime.now().strftime("%Y%m%d%H%M%S")
| # A custom class to help with recursive serialization. | |
| # This approach avoids modifying the original object. | |
| class _RecursiveSerializer(json.JSONEncoder): | |
| """A custom JSONEncoder that handles complex types by converting them to dicts or strings.""" | |
| def default(self, obj): | |
| # Prefer the pydantic method if it exists for the most robust serialization. | |
| if hasattr(obj, 'model_dump'): | |
| return obj.model_dump() | |
| # Handle dataclasses | |
| if dataclasses.is_dataclass(obj): | |
| return dataclasses.asdict(obj) | |
| # Handle other non-serializable but common types. | |
| if isinstance(obj, (datetime, date, UUID)): | |
| return str(obj) | |
| # Fallback for general objects with a __dict__ | |
| if hasattr(obj, '__dict__'): | |
| return obj.__dict__ | |
| # Default fallback to JSONEncoder's behavior | |
| return super().default(obj) | |
def to_json_string(obj: Any, **kwargs) -> str:
    """
    Render a Python object as a JSON string.

    Encoding is delegated to `json.dumps` with `_RecursiveSerializer` as the
    encoder class, so in addition to the natively supported types (dicts,
    lists, strings, numbers, bools, None) the following are accepted:
    - Objects exposing `model_dump()` (e.g. pydantic models).
    - Dataclass instances (via `dataclasses.asdict()`).
    - `datetime`, `date` and `UUID` values (via `str()`).
    - Other objects that carry a `__dict__`.

    Args:
        obj (Any): The object to serialize.
        **kwargs: Extra keyword arguments forwarded to `json.dumps`
            (e.g. `indent`). `cls` must not be passed because the encoder
            class is already supplied here.

    Returns:
        str: The JSON document.

    Example:
        >>> from dataclasses import dataclass
        >>> @dataclass
        ... class Product:
        ...     id: int
        ...     name: str
        >>> print(to_json_string(Product(id=1, name="Laptop"), indent=2))
        {
          "id": 1,
          "name": "Laptop"
        }
    """
    serialized = json.dumps(obj, cls=_RecursiveSerializer, **kwargs)
    return serialized