vikramvasudevan committed on
Commit
b24fcf4
·
verified ·
1 Parent(s): 79ef79b

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. config.py +11 -3
  2. db.py +11 -10
config.py CHANGED
@@ -352,6 +352,15 @@ class SanatanConfig:
352
  "output_dir": "./output/kamba_ramayanam",
353
  "collection_name": "kamba_ramayanam_en",
354
  "unit": "verse",
 
 
 
 
 
 
 
 
 
355
  "metadata_fields": [
356
  {
357
  "name": "kandam",
@@ -668,7 +677,6 @@ class SanatanConfig:
668
  ):
669
  canonical_doc["text"] = canonical_doc["document"]
670
  canonical_doc["document"] = "-"
671
- canonical_doc["verse"] = resolve_field(
672
- config.get("unit_field", config.get("unit"))
673
- )
674
  return canonical_doc
 
352
  "output_dir": "./output/kamba_ramayanam",
353
  "collection_name": "kamba_ramayanam_en",
354
  "unit": "verse",
355
+ "unit_field": "verse_number",
356
+ "field_mapping": {
357
+ "chapter_name": lambda doc: f"{doc.get('kandam','')}",
358
+ "title": lambda doc: f"{doc.get('padalam_ta','')} - {doc.get('padalam_en','')}".strip(),
359
+ "author": lambda doc: "Kamban",
360
+ "unit_index": "verse_number",
361
+ "verse": lambda doc: int(doc.get("verse_number", "0")),
362
+ "relative_path": lambda doc: f"{doc.get('padalam_ta','')} - {doc.get('padalam_en','')}".strip(),
363
+ },
364
  "metadata_fields": [
365
  {
366
  "name": "kandam",
 
677
  ):
678
  canonical_doc["text"] = canonical_doc["document"]
679
  canonical_doc["document"] = "-"
680
+ verse = resolve_field(config.get("unit_field", config.get("unit")))
681
+ canonical_doc["verse"] = int(verse) if verse else 0
 
682
  return canonical_doc
db.py CHANGED
@@ -112,7 +112,7 @@ class SanatanDatabase:
112
  n_results=n_results,
113
  )
114
 
115
- def fetch_document_by_index(self, collection_name: str, index: int, unit_name : str):
116
  """
117
  Fetch one document at a time from a ChromaDB collection using pagination (index = 0-based).
118
 
@@ -133,18 +133,11 @@ class SanatanDatabase:
133
  collection = self.chroma_client.get_or_create_collection(name=collection_name)
134
 
135
  try:
136
- # show a sample data record
137
- response = collection.get(
138
- limit=2,
139
- # offset=index, # pagination via offset
140
- include=["metadatas", "documents"],
141
- )
142
- print(response)
143
  response = collection.get(
144
  limit=1,
145
  # offset=index, # pagination via offset
146
  include=["metadatas", "documents"],
147
- where={unit_name: index}
148
  )
149
  except Exception as e:
150
  logger.error("Error fetching document: %s", e)
@@ -158,11 +151,19 @@ class SanatanDatabase:
158
  result = {"document": documents[0]}
159
  if metadatas:
160
  result.update(metadatas[0])
 
161
  return result
162
  else:
163
  print("No data available")
164
- return {"error": "No data available."}
 
 
 
 
 
 
165
 
 
166
 
167
  def search_semantic(
168
  self,
 
112
  n_results=n_results,
113
  )
114
 
115
+ def fetch_document_by_index(self, collection_name: str, index: int, unit_name: str):
116
  """
117
  Fetch one document at a time from a ChromaDB collection using pagination (index = 0-based).
118
 
 
133
  collection = self.chroma_client.get_or_create_collection(name=collection_name)
134
 
135
  try:
 
 
 
 
 
 
 
136
  response = collection.get(
137
  limit=1,
138
  # offset=index, # pagination via offset
139
  include=["metadatas", "documents"],
140
+ where={"$or": [{unit_name: index}, {unit_name: str(index)}]},
141
  )
142
  except Exception as e:
143
  logger.error("Error fetching document: %s", e)
 
151
  result = {"document": documents[0]}
152
  if metadatas:
153
  result.update(metadatas[0])
154
+ print("raw data = ", result)
155
  return result
156
  else:
157
  print("No data available")
158
+ # show a sample data record
159
+ response = collection.get(
160
+ limit=2,
161
+ # offset=index, # pagination via offset
162
+ include=["metadatas", "documents"],
163
+ )
164
+ print("sample data : ",response)
165
 
166
+ return {"error": "No data available."}
167
 
168
  def search_semantic(
169
  self,