from transformers import pipeline

# Module-level cache: the NER pipeline is expensive to build, so it is
# created once and shared by every caller.
_ner_pipeline = None

def get_ner_pipeline():
    """
    Return the shared DistilBERT NER pipeline, loading it on first use.

    Returns None when the pipeline cannot be constructed (model download
    or backend failure), so callers can fall back to keyword-based
    location spotting instead of crashing.
    """
    global _ner_pipeline
    if _ner_pipeline is None:
        try:
            # DistilBERT NER: lighter and faster than full BERT variants.
            _ner_pipeline = pipeline(
                "ner",
                model="dslim/distilbert-NER",
                aggregation_strategy="simple",
            )
        except Exception:
            # Leave the cache as None; downstream code uses the
            # regex/location-keyword fallback.
            _ner_pipeline = None
    return _ner_pipeline

def extract_hazard_and_locations(text):
    """
    Scan *text* for known hazard keywords and geographic locations.

    Hazards are matched case-insensitively against a fixed keyword list.
    Locations come from the NER pipeline when it is available; when it is
    not (or it finds nothing), a coastal-India place-name list is used as
    a fallback.

    Returns a dict: {"hazards": [...], "locations": [...]}.
    """
    if not text or not text.strip():
        return {"hazards": [], "locations": []}

    hazard_keywords = [
        'Tsunami', 'High Waves', 'Coastal Flooding', 'Storm Surge', 
        'Rip Current', 'Coastal Erosion', 'Algal Bloom', 
        'Marine Pollution', 'Cyclone', 'flood'
    ]
    lowered = text.lower()
    # Keep the keyword-list order in the result.
    detected_hazards = [kw for kw in hazard_keywords if kw.lower() in lowered]

    locations = []
    ner = get_ner_pipeline()
    if ner is not None:
        try:
            locations = [
                ent['word']
                for ent in ner(text)
                if ent.get('entity_group') == 'LOC'
            ]
        except Exception:
            locations = []

    # Fallback: simple place-name spotting when NER is unavailable or empty.
    if not locations:
        location_keywords = [
            "Mumbai","Chennai","Kolkata","Odisha","Kerala","Gujarat","Goa",
            "Andhra Pradesh","West Bengal","Vizag","Visakhapatnam","Puri",
            "Bay of Bengal","Arabian Sea","Tamil Nadu","Maharashtra","Karnataka",
            "Andaman","Nicobar","Lakshadweep","Kochi","Cochin","Mangaluru","Mangalore",
            "Chandipur","Paradip","Digha","Gopalpur"
        ]
        for name in location_keywords:
            if name.lower() in lowered:
                locations.append(name)

    return {"hazards": detected_hazards, "locations": locations}

def extract_hazard_info():
    """
    Demo runner: analyze sample tweets for hazards and locations.

    Loads a Named Entity Recognition (NER) model to find locations, then
    searches each example tweet for specific hazard-related keywords and
    prints the structured results. Returns None; if the model cannot be
    loaded, a message is printed and the function returns early.
    """
    # --- 1. Load the NER Model for Location Extraction ---
    # Large, high-accuracy multilingual model (also handles the Hindi sample).
    ner_model_name = "Davlan/xlm-roberta-large-ner-hrl"
    print(f"Loading NER model: '{ner_model_name}'...")
    try:
        ner_pipeline = pipeline("ner", model=ner_model_name, aggregation_strategy="simple")
        print("NER model loaded successfully!")
    except Exception as e:
        print(f"Failed to load NER model. Error: {e}")
        return

    # --- 2. Define the Hazard Keywords to search for ---
    # These are the exact phrases we will look for in the text.
    hazard_keywords = [
        'Tsunami', 'High Waves', 'Coastal Flooding', 'Storm Surge', 
        'Rip Current', 'Coastal Erosion', 'Algal Bloom', 
        'Marine Pollution', 'Cyclone', 'flood' # Added "flood" as a common variation
    ]

    # --- 3. Prepare Example Tweets for Analysis ---
    tweets_to_analyze = [
        "Major coastal flooding reported in Chennai due to the storm surge. All residents advised to stay indoors.",
        "Authorities have issued a tsunami warning for the entire Odisha coastline after the earthquake.",
        "The recent cyclone has caused severe coastal erosion near Puri beach.",
        "मुंबई में ऊंची लहरों की चेतावनी है, कृपया समुद्र तट से दूर रहें।", # Hindi: "Warning of high waves in Mumbai, please stay away from the beach."
        "Not a hazard: The sunset over the calm sea in Goa was beautiful today."
    ]

    print("\n--- Analyzing Tweets for Hazards and Locations ---")

    for tweet in tweets_to_analyze:
        try:
            # --- Step 1: Extract Locations using the NER model ---
            ner_results = ner_pipeline(tweet)
            # Keep only entities the model tagged as locations ('LOC').
            # .get() guards against aggregation results missing the key.
            locations = [entity['word'] for entity in ner_results if entity.get('entity_group') == 'LOC']

            # --- Step 2: Extract Hazard Keywords directly from the text ---
            detected_hazards = []
            tweet_lower = tweet.lower()
            for hazard in hazard_keywords:
                # Check if the hazard keyword exists in the tweet (case-insensitive)
                if hazard.lower() in tweet_lower:
                    detected_hazards.append(hazard)

            # --- Print the structured results ---
            print(f"Text: '{tweet}'")
            print(f"  -> Location(s): {locations if locations else 'None Detected'}")
            print(f"  -> Detected Hazard(s): {detected_hazards if detected_hazards else 'None Detected'}")
            print("-" * 25)

        except Exception as e:
            print(f"Could not process tweet: '{tweet}'. Error: {e}")

if __name__ == "__main__":
    extract_hazard_info()