Spaces:
Runtime error
Runtime error
| from sentence_transformers import SentenceTransformer | |
| from dotenv import load_dotenv, find_dotenv | |
| from dbcone import getDatabase | |
| from dbcone import getDatabaseIndex | |
| import os | |
| import uuid | |
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from summary import generate_combined_summary_and_key_points | |
# Shared SentenceTransformer instance; assigned by initialize_model().
sentence_model = None

# Run-time settings; each stays None until configureApp() assigns it.
inputDir, outputDir, topic = None, None, None
db_index_name, db_namespace_name = None, None
def initialize_model():
    """Create the sentence-embedding model and store it in ``sentence_model``.

    Every call builds a new ``SentenceTransformer`` and replaces whatever the
    module-level ``sentence_model`` held before.
    """
    global sentence_model
    model_name = 'all-MiniLM-L6-v2'
    sentence_model = SentenceTransformer(model_name)
def get_model():
    """Return the shared sentence model, creating it on first use.

    Returns:
        The object stored in the module-level ``sentence_model`` after
        ``initialize_model()`` has run at least once.
    """
    model_missing = sentence_model is None
    if model_missing:
        initialize_model()
    return sentence_model
def get_sentence_embedding(sentence):
    """Encode ``sentence`` with the shared model.

    Args:
        sentence: Text handed unchanged to the model's ``encode`` method.

    Returns:
        Whatever ``encode`` returns for ``sentence``.
    """
    return get_model().encode(sentence)
def getOutputDir(outputDirectory):
    """Return ``outputDirectory`` as a ``Path``, creating it when missing.

    Args:
        outputDirectory: Directory path (string or path-like).

    Returns:
        ``Path(outputDirectory)``; the directory exists when this returns.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    outputDir = Path(outputDirectory)
    # exist_ok avoids the check-then-create race of a separate exists() test
    # and still creates any missing parent directories.
    outputDir.mkdir(parents=True, exist_ok=True)
    return outputDir
def read_files(inputDirectory, outputDirectory, topic=None):
    """Embed and summarise every ``.txt`` file found in ``inputDirectory``.

    For each regular ``.txt`` file the text is read, embedded with
    ``get_sentence_embedding`` and summarised with
    ``generate_combined_summary_and_key_points``. Once that has succeeded the
    file is moved into ``outputDirectory``; if a file with the same name is
    already there, the input copy is deleted instead.

    Args:
        inputDirectory: Directory scanned (non-recursively) for ``.txt`` files.
        outputDirectory: Directory that receives processed files. It is
            created only when the input directory has at least one entry.
        topic: Base topic label. Defaults to the input directory's name.

    Returns:
        A list with one dict per processed file, holding the keys ``"id"``
        (random hex string), ``"metadata"`` (text, topic, summary, keypoints)
        and ``"values"`` (the embedding as a list). The list is empty when
        the input directory is missing, is not a directory, or has no entries.
    """
    inputDir = Path(inputDirectory)
    embeded_lst = []
    # isdir() is False for missing paths too, so one check covers both cases.
    if not os.path.isdir(inputDir):
        return embeded_lst

    files = os.listdir(inputDir)
    if not files:
        return embeded_lst

    base_topic = os.path.basename(inputDir) if topic is None else topic
    outputDir = getOutputDir(outputDirectory)

    for file in files:
        if not file.endswith(".txt"):
            continue
        file_path = os.path.join(inputDir, file)
        if not os.path.isfile(file_path):
            continue

        # NOTE(review): the platform default encoding is used, as before;
        # confirm it matches whatever writes these files.
        with open(file_path, 'r') as f:
            text = f.read()

        embedding = get_sentence_embedding(text)
        (topic_gen, summary, keypoints) = generate_combined_summary_and_key_points(text)

        # Derive the topic per file from the base topic. Appending to one
        # running string made every later file inherit the generated topics
        # of all files processed before it.
        if topic_gen is not None:
            file_topic = base_topic + " - " + topic_gen
        else:
            file_topic = base_topic

        # Move the file only after embedding and summarising succeeded, so a
        # failure above leaves the input in place for a later retry.
        destination = os.path.join(outputDir, file)
        if os.path.isfile(destination):
            os.remove(file_path)
        else:
            os.rename(file_path, destination)

        embeded_lst.append(
            {
                "id": str(uuid.uuid4().hex),
                "metadata": {
                    'text': text,
                    "topic": file_topic,
                    "summary": summary,
                    "keypoints": keypoints,
                },
                "values": embedding.tolist(),
            }
        )
    return embeded_lst
def save_to_database(embeded_lst, index_name = 'test_videos' ,namespace="sample-namespace", batch_size=100):
    """Upsert the embedded records into the vector index, in batches.

    Args:
        embeded_lst: Records to upsert, as produced by ``read_files``.
            ``None`` or an empty list is a no-op.
        index_name: Name passed to ``getDatabaseIndex``.
        namespace: Namespace passed to every ``upsert`` call.
        batch_size: Maximum number of records sent per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # NOTE(review): the default index name here ('test_videos') differs from
    # fetch_from_database's default ('test-videos'). It is left unchanged for
    # compatibility; confirm which spelling is intended.
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if not embeded_lst:
        return

    db_index = getDatabaseIndex(index_name)
    # Send bounded chunks rather than one request that grows with the number
    # of input files; vector stores typically cap the size of a single upsert.
    for start in range(0, len(embeded_lst), batch_size):
        db_index.upsert(
            vectors=embeded_lst[start:start + batch_size],
            namespace=namespace,
        )
def embed_text_files(inputDir, outputDir, topic):
    """Run ``read_files`` on the given directories and topic.

    Args:
        inputDir: Directory holding the text files to embed.
        outputDir: Directory that receives the processed files.
        topic: Topic label forwarded to ``read_files``.

    Returns:
        The list returned by ``read_files``.
    """
    embedded = read_files(inputDir, outputDir, topic=topic)
    return embedded
def configureApp(given_topic):
    """Fill in the module-level settings and initialise model and database.

    The input and output directories are set to ``output`` and ``processed``
    under the current working directory.

    Args:
        given_topic: Value stored in the module-level ``topic``.

    Returns:
        Always ``True``.
    """
    global inputDir, outputDir, topic, db_index_name, db_namespace_name

    working_dir = Path.cwd()
    topic = given_topic
    db_index_name, db_namespace_name = 'samplevideos', "video-namespace"
    inputDir = os.path.join(working_dir, 'output')
    outputDir = os.path.join(working_dir, 'processed')

    # Original call order is kept; presumably getDatabase() relies on the
    # variables loaded from Keys1.env (verify against dbcone).
    load_dotenv(find_dotenv('Keys1.env'))
    initialize_model()
    getDatabase()
    return True
def fetch_from_database(search_text, topics=None, top_k = 5, index_name = 'test-videos' ,namespace="sample-namespace"):
    """Query the vector index for the records closest to ``search_text``.

    Args:
        search_text: Text that is embedded and used as the query vector.
        topics: Optional topic or iterable of topics. When given, results are
            restricted to records whose ``topic`` metadata equals one of
            them. ``None`` or an empty collection means no topic filter.
        top_k: Number of matches requested.
        index_name: Name passed to ``getDatabaseIndex``.
        namespace: Namespace to query.

    Returns:
        The object returned by the index's ``query`` method.
    """
    db_index = getDatabaseIndex(index_name)
    query_args = {
        "namespace": namespace,
        "vector": np.array(get_sentence_embedding(search_text)).tolist(),
        "top_k": top_k,
        "include_values": True,
        "include_metadata": True,
    }
    # A bare string would otherwise be split into single characters below.
    if isinstance(topics, str):
        topics = [topics]
    # Add the filter only when there is something to filter on: an "$in"
    # filter with an empty list names no topic at all, which made the default
    # call useless.
    if topics:
        query_args["filter"] = {"topic": {"$in": list(topics)}}
    return db_index.query(**query_args)
def captureData():
    """Embed the configured input files and upsert them into the index.

    Reads the module-level ``inputDir``, ``outputDir``, ``topic``,
    ``db_index_name`` and ``db_namespace_name`` set by ``configureApp``.
    """
    records = embed_text_files(inputDir, outputDir, topic)
    save_to_database(
        records,
        index_name=db_index_name,
        namespace=db_namespace_name,
    )
def queryRepository(search_text, topic):
    """Search the configured index for ``search_text`` and print the result.

    Args:
        search_text: Text to search for.
        topic: Single topic used as the topic filter.
    """
    result = fetch_from_database(
        search_text,
        topics=[topic],
        index_name=db_index_name,
        namespace=db_namespace_name,
    )
    print(f'Results: {result}')
def mainApp(topic=None):
    """Configure the application and ingest the pending text files.

    Args:
        topic: Topic label for the ingested files. With ``None``,
            ``read_files`` falls back to the input directory's name.
    """
    configureApp(topic)
    captureData()


if __name__ == "__main__":
    # mainApp() used to require a topic, so this bare call raised TypeError.
    mainApp()