Spaces:
Sleeping
Sleeping
File size: 2,525 Bytes
49e67a8 f4d6026 49e67a8 f0663fb 49e67a8 f4d6026 49e67a8 f0663fb 49e67a8 f4d6026 49e67a8 f4d6026 49e67a8 f4d6026 49e67a8 f4d6026 49e67a8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
from transformers import pipeline
from scraper import fetch_hazard_tweets
from translate import translate_to_english
from sentiment import classify_emotion_text
from ner import extract_hazard_and_locations
import json
# Hugging Face model id: NLI cross-encoder used for zero-shot classification.
model_name = "cross-encoder/nli-deberta-v3-base"
# Lazy loading - only load when needed
# Module-level singleton; populated on first call to get_classifier().
classifier = None
def get_classifier():
    """Return the shared zero-shot classification pipeline, building it on first use.

    Loading the model is expensive, so it is deferred until the first call
    rather than done at import time. Subsequent calls reuse the cached
    module-level instance.
    """
    global classifier
    if classifier is not None:
        return classifier
    classifier = pipeline("zero-shot-classification", model=model_name, framework="pt")
    return classifier
def classify_with_model(tweet_text, threshold=0.75):
    """Classify a tweet as hazardous using zero-shot NLI classification.

    Runs the DeBERTa-v3 cross-encoder zero-shot pipeline with two candidate
    labels; the tweet counts as hazardous only when the hazard label ranks
    first AND its score exceeds ``threshold``.

    Args:
        tweet_text: Tweet text to classify; may be None or empty.
        threshold: Minimum confidence for the hazard label. Defaults to
            0.75, matching the previously hard-coded cutoff, but callers
            can now tune precision/recall without editing this module.

    Returns:
        1 if the tweet is classified as an ocean-hazard report above
        ``threshold``, else 0.
    """
    # Guard: None / empty / whitespace-only text can never be a hazard report,
    # and skipping early avoids loading the model at all for such inputs.
    if not tweet_text or not tweet_text.strip():
        return 0
    candidate_labels = ["report of an ocean hazard", "not an ocean hazard"]
    classifier_instance = get_classifier()
    result = classifier_instance(tweet_text, candidate_labels)
    # The pipeline returns labels/scores sorted best-first.
    top_label = result['labels'][0]
    top_score = result['scores'][0]
    if top_label == "report of an ocean hazard" and top_score > threshold:
        return 1
    return 0
def classify_tweets(tweets):
    """Translate, classify, and enrich a batch of tweet dicts.

    Per-tweet pipeline: translate -> hazard classification -> (only when
    hazardous) sentiment analysis + NER. Non-hazardous tweets receive
    neutral placeholder values so every output dict has the same shape.

    Args:
        tweets: List of dicts, each expected to carry a 'text' field.

    Returns:
        A new list of enriched dicts; the input dicts are not mutated.
    """
    enriched = []
    for tweet in tweets:
        record = dict(tweet)
        # Step 1: translate every tweet up front (more efficient), since
        # classification is more accurate on English text.
        english = translate_to_english(tweet.get('text', ''))
        record['translated_text'] = english
        # Step 2: classify using the translated text.
        record['hazardous'] = classify_with_model(english)
        # Step 3: run the heavier analysis only on hazardous tweets.
        if record['hazardous'] == 1:
            record['sentiment'] = classify_emotion_text(english)
            record['ner'] = extract_hazard_and_locations(english)
        else:
            # Keep the output schema uniform for non-hazardous tweets.
            record['sentiment'] = {"label": "neutral", "score": 0.0}
            record['ner'] = {"hazards": [], "locations": []}
        enriched.append(record)
    return enriched
if __name__ == "__main__":
    # Fetch a small batch, run the full enrichment pipeline, and
    # pretty-print the result as UTF-8 JSON.
    results = classify_tweets(fetch_hazard_tweets(limit=20))
    print(json.dumps(results, indent=2, ensure_ascii=False))