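"""Terjman-v2 Hugging Face Space.

Gradio app that translates English text to Moroccan Darija with one of four
Terjman-v2 models, and logs every translation (source, output, model,
timestamp, user ID) in batches to a private Hugging Face dataset.
"""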
import gradio as gr
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import os
import torch
import spaces  # Hugging Face Spaces ZeroGPU helper (no @spaces.GPU decorator is used below)
from datasets import Dataset, load_dataset, concatenate_datasets
import time
import datetime
# Define model paths
MODEL_PATHS = {
    "Terjman-Nano-v2": "BounharAbdelaziz/Terjman-Nano-v2.0",
    "Terjman-Large-v2": "BounharAbdelaziz/Terjman-Large-v2.0",
    "Terjman-Ultra-v2": "BounharAbdelaziz/Terjman-Ultra-v2.0",
    "Terjman-Supreme-v2": "BounharAbdelaziz/Terjman-Supreme-v2.0",
}

# Load the Hugging Face access token from the environment
TOKEN = os.environ["TOKEN"]

# Dataset configuration
DATASET_REPO = "BounharAbdelaziz/terjman-v2-live-translations"
# Number of translations to collect before pushing
BATCH_SIZE = 10
# Time in seconds between pushes (1 hour)
UPDATE_INTERVAL = 3600

# Initialize dataset tracking
translations_buffer = []
last_push_time = time.time()

def preload_models():
    """Preload models and tokenizers."""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"[INFO] Using device: {device}")

    # Load Nano and Large models
    nano_large_models = {}
    for model_name in ["Terjman-Nano-v2", "Terjman-Large-v2"]:
        print(f"[INFO] Loading {model_name}...")
        translator = pipeline(
            "translation",
            model=MODEL_PATHS[model_name],
            token=TOKEN,
            device=device if device.startswith("cuda") else -1,
        )
        nano_large_models[model_name] = translator

    # Load Ultra and Supreme models (these pipelines take explicit language codes)
    ultra_supreme_models = {}
    for model_name in ["Terjman-Ultra-v2", "Terjman-Supreme-v2"]:
        print(f"[INFO] Loading {model_name}...")
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATHS[model_name], token=TOKEN).to(device)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATHS[model_name], token=TOKEN)
        translator = pipeline(
            "translation",
            model=model,
            tokenizer=tokenizer,
            device=device if device.startswith("cuda") else -1,
            src_lang="eng_Latn",
            tgt_lang="ary_Arab",
        )
        ultra_supreme_models[model_name] = translator

    return nano_large_models, ultra_supreme_models

def push_to_hf_dataset():
    """Save translations to the HF dataset for monitoring, preserving previous data."""
    global translations_buffer, last_push_time
    if not translations_buffer:
        return
    try:
        print(f"[INFO] Pushing {len(translations_buffer)} translations to Hugging Face dataset...")
        # Create a dataset from the buffer
        new_data = Dataset.from_dict({
            "source_text": [item["source_text"] for item in translations_buffer],
            "translated_text": [item["translated_text"] for item in translations_buffer],
            "model_used": [item["model_used"] for item in translations_buffer],
            "timestamp": [item["timestamp"] for item in translations_buffer],
            "user_id": [item["user_id"] for item in translations_buffer],  # Include user ID
        })
        # Try to load the existing dataset and append to it
        try:
            existing_dataset = load_dataset(DATASET_REPO, split="live_translations", token=TOKEN)
            print(f"[INFO] Loaded existing dataset with {len(existing_dataset)} entries")
            # Concatenate existing data with new data
            combined_dataset = concatenate_datasets([existing_dataset, new_data])
            print(f"[INFO] Combined dataset now has {len(combined_dataset)} entries")
        except Exception as e:
            print(f"[INFO] No existing dataset found or error loading: {str(e)}")
            print("[INFO] Creating new dataset")
            combined_dataset = new_data
        # Push to the Hub
        combined_dataset.push_to_hub(
            DATASET_REPO,
            token=TOKEN,
            split="live_translations",
            private=True,
        )
        # Clear the buffer and reset the timer
        translations_buffer = []
        last_push_time = time.time()
        print("[INFO] Successfully pushed translations to Hugging Face dataset")
    except Exception as e:
        print(f"[ERROR] Failed to push dataset to Hugging Face: {str(e)}")

def translate_nano_large(text, model_name):
    """Translation function for Nano and Large models."""
    translator = nano_large_models[model_name]
    translated = translator(
        text,
        max_length=512,
        num_beams=4,
        no_repeat_ngram_size=3,
        early_stopping=True,
        do_sample=False,
        pad_token_id=translator.tokenizer.pad_token_id,
        bos_token_id=translator.tokenizer.bos_token_id,
        eos_token_id=translator.tokenizer.eos_token_id,
    )
    return translated[0]["translation_text"]


def translate_ultra_supreme(text, model_name):
    """Translation function for Ultra and Supreme models."""
    translator = ultra_supreme_models[model_name]
    return translator(text)[0]["translation_text"]

def translate_text(text, model_choice, request: gr.Request):
    """Main translation function."""
    global translations_buffer, last_push_time
    # Skip empty text
    if not text or text.strip() == "":
        return "Please enter text to translate."
    # Get the user ID (if logged in)
    user_id = "anonymous"
    if request and hasattr(request, "username") and request.username:
        user_id = request.username
    # Perform the translation
    if model_choice in ["Terjman-Nano-v2", "Terjman-Large-v2"]:
        translation = translate_nano_large(text, model_choice)
    elif model_choice in ["Terjman-Ultra-v2", "Terjman-Supreme-v2"]:
        translation = translate_ultra_supreme(text, model_choice)
    else:
        return "Invalid model selection."
    # Add the pair to the buffer
    translations_buffer.append({
        "source_text": text,
        "translated_text": translation,
        "model_used": model_choice,
        "timestamp": datetime.datetime.now().isoformat(),
        "user_id": user_id,  # Add the user ID to the dataset
    })
    # Push to HF when the buffer is full or the update interval has elapsed
    current_time = time.time()
    if len(translations_buffer) >= BATCH_SIZE or (current_time - last_push_time) >= UPDATE_INTERVAL:
        push_to_hf_dataset()
    return translation

def gradio_app():
    with gr.Blocks() as app:
        gr.Markdown("# 🇲🇦 Terjman-v2")
        gr.Markdown("Choose a model and enter the English text you want to translate to Moroccan Darija.")
        model_choice = gr.Dropdown(
            label="Select Model",
            choices=["Terjman-Nano-v2", "Terjman-Large-v2", "Terjman-Ultra-v2", "Terjman-Supreme-v2"],
            value="Terjman-Ultra-v2",
        )
        input_text = gr.Textbox(label="Input Text", placeholder="Enter text to translate...", lines=3)
        output_text = gr.Textbox(label="Translated Text", interactive=False, lines=3)
        translate_button = gr.Button("Translate")
        # Link input and output. Gradio injects the request object into any
        # event handler that declares a `request: gr.Request` parameter;
        # instantiating gr.Request() by hand yields an empty request with no
        # username, so the injected one must be forwarded instead.
        def translate_and_update_status(text, model, request: gr.Request):
            """Wrapper that forwards the injected request to translate_text."""
            return translate_text(text, model, request)

        translate_button.click(
            fn=translate_and_update_status,
            inputs=[input_text, model_choice],
            outputs=[output_text],
        )
    return app

# Run the app
if __name__ == "__main__":
    # Register shutdown handler to save remaining translations
    import atexit
    atexit.register(push_to_hf_dataset)
    # Preload all models
    nano_large_models, ultra_supreme_models = preload_models()
    # Launch the app
    app = gradio_app()
    app.launch()
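# Assumed runtime dependencies for this Space (not pinned anywhere in the
# source): gradio, transformers, torch, datasets, spaces.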
