Spaces:

Canstralian
/

DockerTester

Paused

App Files Files Community

Canstralian committed on Dec 13, 2024

Commit

b044f34

verified ·

1 Parent(s): 3526e73

Create app.py

Browse files

Files changed (1) hide show

app.py +60 -0

app.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from fastapi import FastAPI, Query
+from datasets import load_dataset
+from typing import List
+# FastAPI application instance; the @app.get(...) decorators in this file register routes on it.
+app = FastAPI()
+# Load the dataset in streaming mode for memory efficiency.
+# NOTE: this call runs at module import time, and the handlers in this file
+# iterate dataset["train"] from the beginning on each request.
+dataset = load_dataset("togethercomputer/RedPajama-Data-1T", streaming=True)
+@app.get("/")
+def greet_json():
+    return {"message": "Welcome to the RedPajama Dataset API"}
+@app.get("/get_data/")
+def get_data(chunk_size: int = 10):
+    """
+    Returns a small chunk of the dataset.
+    Parameters:
+    - chunk_size: The number of examples to return (default: 10).
+    Returns:
+    - A list of examples from the dataset.
+    """
+    data_chunk = []
+    for i, example in enumerate(dataset["train"]):  # Adjust split if needed
+        data_chunk.append(example)
+        if i + 1 == chunk_size:
+            break
+    return {"data": data_chunk}
+@app.get("/search_data/")
+def search_data(keyword: str, max_results: int = 10):
+    """
+    Searches the dataset for a specific keyword in the text fields.
+    Parameters:
+    - keyword: The keyword to search for.
+    - max_results: The maximum number of results to return (default: 10).
+    Returns:
+    - A list of examples containing the keyword.
+    """
+    results = []
+    for example in dataset["train"]:  # Adjust split if needed
+        if keyword.lower() in str(example).lower():
+            results.append(example)
+        if len(results) == max_results:
+            break
+    return {"results": results}
+@app.get("/data_summary/")
+def data_summary():
+    """
+    Provides a basic summary of the dataset.
+    Returns:
+    - A dictionary with dataset details (e.g., number of splits).
+    """
+    return {"dataset_splits": dataset.keys()}