add csv data indexing
Browse files
main.py
CHANGED
|
@@ -6,6 +6,8 @@ import json
|
|
| 6 |
import os
|
| 7 |
import logging
|
| 8 |
from txtai.embeddings import Embeddings
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Set up logging
|
| 11 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -105,6 +107,41 @@ async def query_index(request: QueryRequest):
|
|
| 105 |
logger.error(f"Error querying index: {str(e)}")
|
| 106 |
raise HTTPException(status_code=500, detail=f"Error querying index: {str(e)}")
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
if __name__ == "__main__":
|
| 109 |
import uvicorn
|
| 110 |
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 6 |
import os
|
| 7 |
import logging
|
| 8 |
from txtai.embeddings import Embeddings
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import glob
|
| 11 |
|
| 12 |
# Set up logging
|
| 13 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 107 |
logger.error(f"Error querying index: {str(e)}")
|
| 108 |
raise HTTPException(status_code=500, detail=f"Error querying index: {str(e)}")
|
| 109 |
|
| 110 |
+
def process_csv_file(file_path):
    """Read a CSV file and turn each row into one text document.

    Cells are read as plain text exactly as written in the file, so numbers
    are not reformatted (no ``1`` -> ``1.0``) and empty cells are skipped
    instead of contributing the literal word ``nan`` to the document text.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(txtai_data, documents)`` tuple. ``documents`` is a list with one
        space-joined string per CSV row; ``txtai_data`` is a list of
        ``(row_number, text, None)`` tuples in the same order. Returns
        ``(None, None)`` if the file cannot be read or parsed.
    """
    try:
        # dtype=str + keep_default_na=False preserve each cell's original
        # text; fillna("") covers any cell pandas still reports as missing.
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False).fillna("")
        documents = [
            " ".join(cell for cell in row if cell)
            for row in df.itertuples(index=False, name=None)
        ]
        txtai_data = [(i, text, None) for i, text in enumerate(documents)]
        return txtai_data, documents
    except Exception as e:
        # Best-effort by design: callers treat (None, None) as "skip this
        # file". logger.exception keeps the traceback for diagnosis.
        logger.exception(f"Error processing CSV file {file_path}: {str(e)}")
        return None, None
|
| 119 |
+
|
| 120 |
+
def check_and_index_csv_files(index_data_folder="/app/index_data",
                              indexes_folder="/app/indexes"):
    """Index every CSV file in the data folder that has no index yet.

    The index id of a CSV file is its file name without the extension. A
    file is skipped when a path with that id already exists under
    ``indexes_folder``.

    Args:
        index_data_folder: Directory scanned for ``*.csv`` files.
        indexes_folder: Directory whose entries mark already-built indexes.

    Returns:
        None. Progress and failures are reported through ``logger``.
    """
    # isdir (not exists): a regular file at this path is not a usable folder.
    if not os.path.isdir(index_data_folder):
        logger.warning(f"index_data folder not found: {index_data_folder}")
        return

    # glob order is unspecified; sort so files are processed deterministically.
    for csv_file in sorted(glob.glob(os.path.join(index_data_folder, "*.csv"))):
        index_id = os.path.splitext(os.path.basename(csv_file))[0]

        if os.path.exists(os.path.join(indexes_folder, index_id)):
            logger.info(f"Index already exists for: {csv_file}")
            continue

        logger.info(f"Processing CSV file: {csv_file}")
        txtai_data, documents = process_csv_file(csv_file)

        # Covers both a failed read (None, None) and a CSV with no data rows.
        if not (txtai_data and documents):
            logger.warning(f"Failed to process CSV file: {csv_file}")
            continue

        # NOTE(review): `embeddings` and `save_embeddings` are defined
        # elsewhere in this file; presumably index() rebuilds the shared
        # index and save_embeddings() persists it under index_id - confirm.
        embeddings.index(txtai_data)
        save_embeddings(index_id, documents)
        logger.info(f"CSV file indexed successfully: {csv_file}")
|
| 140 |
+
|
| 141 |
+
@app.on_event("startup")
async def startup_event() -> None:
    """Startup hook registered on ``app``: index CSV files that lack an index."""
    check_and_index_csv_files()
|
| 144 |
+
|
| 145 |
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run as a script.
    import uvicorn

    # Serve `app` on all network interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|