File size: 2,525 Bytes
49e67a8
 
 
 
 
 
 
f4d6026
49e67a8
f0663fb
 
 
 
 
 
 
 
 
49e67a8
 
 
f4d6026
49e67a8
 
 
 
 
f0663fb
 
49e67a8
 
 
 
 
 
 
 
 
f4d6026
49e67a8
 
 
 
 
 
f4d6026
 
49e67a8
 
f4d6026
 
 
 
 
 
49e67a8
 
 
 
 
f4d6026
 
 
 
 
49e67a8
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from transformers import pipeline
from scraper import fetch_hazard_tweets
from translate import translate_to_english
from sentiment import classify_emotion_text
from ner import extract_hazard_and_locations
import json

# Hugging Face hub id of the zero-shot NLI cross-encoder used to decide
# whether a tweet reports an ocean hazard.
model_name = "cross-encoder/nli-deberta-v3-base"

# Module-level cache for the zero-shot pipeline; populated lazily by
# get_classifier() so importing this module does not trigger the slow
# model download/load at startup.
classifier = None

def get_classifier():
    """Return the shared zero-shot classification pipeline.

    Builds the pipeline on first call and caches it in the module-level
    ``classifier`` variable, so the expensive model load happens at most
    once per process.
    """
    global classifier
    if classifier is not None:
        return classifier
    classifier = pipeline(
        "zero-shot-classification", model=model_name, framework="pt"
    )
    return classifier

def classify_with_model(tweet_text, threshold=0.75):
    """Classify a tweet as ocean-hazard-related via zero-shot NLI.

    Uses the shared DeBERTa-v3 cross-encoder pipeline (see get_classifier)
    with two candidate labels and accepts the hazard label only when its
    score clears ``threshold``.

    Args:
        tweet_text: Tweet text (ideally already translated to English).
        threshold: Minimum top-label confidence required to flag the tweet
            as hazardous. Defaults to 0.75, the original hard-coded cutoff.

    Returns:
        1 if the top label is the hazard label with score > ``threshold``,
        else 0. Empty or whitespace-only input returns 0 without invoking
        the model.
    """
    if not tweet_text or not tweet_text.strip():
        return 0
    # Single source of truth for the positive label, used both as a
    # candidate and in the acceptance check below.
    hazard_label = "report of an ocean hazard"
    candidate_labels = [hazard_label, "not an ocean hazard"]
    result = get_classifier()(tweet_text, candidate_labels)
    # Pipeline returns labels/scores sorted by descending confidence.
    if result['labels'][0] == hazard_label and result['scores'][0] > threshold:
        return 1
    return 0

def classify_tweets(tweets):
    """Run the full enrichment pipeline over a list of tweet dicts.

    For each tweet: translate its text to English, zero-shot classify it
    as hazardous or not, and — only when hazardous — run emotion and NER
    analysis. Non-hazardous tweets get neutral/empty placeholder fields so
    every output dict has the same schema.

    Args:
        tweets: list of dicts, each expected to carry a 'text' field.

    Returns:
        A new list of enriched dicts; the input dicts are not mutated.
    """
    enriched = []
    for tweet in tweets:
        record = dict(tweet)  # shallow copy so callers' dicts stay intact

        # Translate every tweet up front; classification runs on the
        # English text for better accuracy.
        english = translate_to_english(tweet.get('text', ''))
        record['translated_text'] = english

        is_hazard = classify_with_model(english)
        record['hazardous'] = is_hazard

        if is_hazard == 1:
            # Deeper analysis only for tweets flagged as hazardous.
            record['sentiment'] = classify_emotion_text(english)
            record['ner'] = extract_hazard_and_locations(english)
        else:
            # Uniform schema: placeholders for non-hazardous tweets.
            record['sentiment'] = {"label": "neutral", "score": 0.0}
            record['ner'] = {"hazards": [], "locations": []}

        enriched.append(record)
    return enriched

if __name__ == "__main__":
    # Smoke-run the pipeline on a small batch and pretty-print the result.
    sample = fetch_hazard_tweets(limit=20)
    enriched = classify_tweets(sample)
    print(json.dumps(enriched, indent=2, ensure_ascii=False))