Spaces:
Paused
Paused
from itertools import islice
from typing import List

from datasets import load_dataset
from fastapi import FastAPI, Query
# FastAPI application object for this service.
app = FastAPI()

# Load the dataset in streaming mode for memory efficiency
dataset = load_dataset("togethercomputer/RedPajama-Data-1T", streaming=True)
def greet_json():
    """
    Returns the welcome payload of the API.

    Returns:
    - A dictionary with a single "message" key holding the greeting text.
    """
    # NOTE(review): no route decorator is visible on this handler in this
    # view; presumably it is registered on the FastAPI app - confirm.
    return {"message": "Welcome to the RedPajama Dataset API"}
def get_data(chunk_size: int = 10):
    """
    Returns a small chunk of the dataset.

    Parameters:
    - chunk_size: The number of examples to return (default: 10). Zero or
      negative values produce an empty chunk.

    Returns:
    - A dictionary whose "data" key holds a list of at most `chunk_size`
      examples read from the start of the "train" split.
    """
    # Clamp the request: with an equality-based stop test a zero or negative
    # chunk_size never matches and the whole stream gets consumed. islice
    # also rejects negative counts, so the clamp is required either way.
    limit = max(chunk_size, 0)
    # islice stops pulling from the stream once `limit` examples were read.
    data_chunk = list(islice(dataset["train"], limit))  # Adjust split if needed
    return {"data": data_chunk}
def search_data(keyword: str, max_results: int = 10):
    """
    Searches the dataset for a specific keyword, ignoring case.

    The keyword is matched against the string form of each whole example
    (`str(example)`), so any text that appears in that string counts as a
    hit, not only the content of individual fields.

    The scan only ends once `max_results` matches were found or the split
    is exhausted, so a rare keyword can take a long time to answer.

    Parameters:
    - keyword: The keyword to search for.
    - max_results: The maximum number of results to return (default: 10).
      Zero or negative values produce an empty result list.

    Returns:
    - A dictionary whose "results" key holds the list of matching examples,
      in the order they occur in the "train" split.
    """
    # Lower-case the keyword once instead of once per example.
    needle = keyword.lower()
    # Lazy filter: nothing is read from the stream until islice asks for it.
    matches = (
        example
        for example in dataset["train"]  # Adjust split if needed
        if needle in str(example).lower()
    )
    # Clamp the limit: a length-equality stop test never fires for a zero or
    # negative max_results, which would scan the entire stream.
    return {"results": list(islice(matches, max(max_results, 0)))}
def data_summary():
    """
    Provides a basic summary of the dataset.

    Returns:
    - A dictionary whose "dataset_splits" key holds the list of keys of the
      loaded dataset object (e.g., the split names).
    """
    # Materialise the keys as a list: a plain list serialises to JSON
    # directly, whereas a keys view is not a JSON-compatible container.
    return {"dataset_splits": list(dataset.keys())}