# NOTE: the following lines are page chrome captured when this file was
# scraped from its Hugging Face Spaces listing; kept as comments so the
# module parses as valid Python.
#   Spaces: Sleeping
#   File size: 5,356 Bytes
#   Revisions: 49e67a8 f4d6026
from transformers import pipeline
# Module-level cache for the NER pipeline; populated lazily by get_ner_pipeline().
_ner_pipeline = None
def get_ner_pipeline():
    """
    Return the shared DistilBERT NER pipeline, loading it on first use.

    Returns:
        The cached Hugging Face ``pipeline`` object, or ``None`` when the
        model cannot be loaded (callers fall back to keyword matching).
    """
    global _ner_pipeline
    if _ner_pipeline is None:
        try:
            # Lightweight DistilBERT checkpoint; "simple" aggregation merges
            # sub-word tokens into whole entity spans.
            _ner_pipeline = pipeline(
                "ner",
                model="dslim/distilbert-NER",
                aggregation_strategy="simple",
            )
        except Exception:
            # Leave the cache unset so downstream code can use its
            # regex/location-keyword fallback (a later call may retry).
            _ner_pipeline = None
    return _ner_pipeline
def extract_hazard_and_locations(text):
    """
    Scan *text* for known hazard keywords and place names.

    Args:
        text: Free-form text (e.g. a tweet); may be None or blank.

    Returns:
        dict: ``{"hazards": [...], "locations": [...]}``. Locations come from
        the NER pipeline when it is available; otherwise a fixed list of
        Indian coastal place names is matched as a fallback.
    """
    # Blank/None input yields an empty result without touching the model.
    if not text or not text.strip():
        return {"hazards": [], "locations": []}

    hazard_keywords = [
        'Tsunami', 'High Waves', 'Coastal Flooding', 'Storm Surge',
        'Rip Current', 'Coastal Erosion', 'Algal Bloom',
        'Marine Pollution', 'Cyclone', 'flood'
    ]
    lowered = text.lower()
    # Case-insensitive substring match against the canonical keyword list.
    detected_hazards = [kw for kw in hazard_keywords if kw.lower() in lowered]

    locations = []
    ner = get_ner_pipeline()
    if ner is not None:
        try:
            locations = [
                entity['word']
                for entity in ner(text)
                if entity.get('entity_group') == 'LOC'
            ]
        except Exception:
            # Any model-side failure degrades to the keyword fallback below.
            locations = []

    if not locations:
        # Fallback: spot well-known Indian coastal place names directly.
        location_keywords = [
            "Mumbai","Chennai","Kolkata","Odisha","Kerala","Gujarat","Goa",
            "Andhra Pradesh","West Bengal","Vizag","Visakhapatnam","Puri",
            "Bay of Bengal","Arabian Sea","Tamil Nadu","Maharashtra","Karnataka",
            "Andaman","Nicobar","Lakshadweep","Kochi","Cochin","Mangaluru","Mangalore",
            "Chandipur","Paradip","Digha","Gopalpur"
        ]
        locations = [name for name in location_keywords if name.lower() in lowered]

    return {"hazards": detected_hazards, "locations": locations}
# Removed hard-coded demo runner; this module now only provides reusable functions.
"""
Loads a Named Entity Recognition (NER) model to find locations and then
searches the text for specific hazard-related keywords.
"""
# --- 1. Load the NER Model for Location Extraction ---
# UPDATED: Using the large, high-accuracy model as requested.
ner_model_name = "Davlan/xlm-roberta-large-ner-hrl"
print(f"Loading NER model: '{ner_model_name}'...")
try:
ner_pipeline = pipeline("ner", model=ner_model_name, aggregation_strategy="simple")
print("NER model loaded successfully!")
except Exception as e:
print(f"Failed to load NER model. Error: {e}")
return
# --- 2. Define the Hazard Keywords to search for ---
# These are the exact phrases we will look for in the text.
hazard_keywords = [
'Tsunami', 'High Waves', 'Coastal Flooding', 'Storm Surge',
'Rip Current', 'Coastal Erosion', 'Algal Bloom',
'Marine Pollution', 'Cyclone', 'flood' # Added "flood" as a common variation
]
# --- 3. Prepare Example Tweets for Analysis ---
tweets_to_analyze = [
"Major coastal flooding reported in Chennai due to the storm surge. All residents advised to stay indoors.",
"Authorities have issued a tsunami warning for the entire Odisha coastline after the earthquake.",
"The recent cyclone has caused severe coastal erosion near Puri beach.",
"मुंबई में ऊंची लहरों की चेतावनी है, कृपया समुद्र तट से दूर रहें।", # Hindi: "Warning of high waves in Mumbai, please stay away from the beach."
"Not a hazard: The sunset over the calm sea in Goa was beautiful today."
]
print("\n--- Analyzing Tweets for Hazards and Locations ---")
for tweet in tweets_to_analyze:
try:
# --- Step 1: Extract Locations using the NER model ---
ner_results = ner_pipeline(tweet)
# Filter the results to get only the words identified as locations ('LOC').
locations = [entity['word'] for entity in ner_results if entity['entity_group'] == 'LOC']
# --- Step 2: Extract Hazard Keywords directly from the text ---
detected_hazards = []
tweet_lower = tweet.lower()
for hazard in hazard_keywords:
# Check if the hazard keyword exists in the tweet (case-insensitive)
if hazard.lower() in tweet_lower:
detected_hazards.append(hazard)
# --- Print the structured results ---
print(f"Text: '{tweet}'")
print(f" -> Location(s): {locations if locations else 'None Detected'}")
print(f" -> Detected Hazard(s): {detected_hazards if detected_hazards else 'None Detected'}")
print("-" * 25)
except Exception as e:
print(f"Could not process tweet: '{tweet}'. Error: {e}")
if __name__ == "__main__":
extract_hazard_info()
# (end of scraped file)