# pravaah/scraper.py
import requests
import json
from datetime import date, timedelta
from dotenv import load_dotenv
import os
# Load values from .env into environment
load_dotenv()
# Access the API key
API_KEY = os.getenv("TWITTER_API_KEY", "").strip()
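
# The key is expected in a local .env file alongside this script; a minimal
# sketch of its contents (the value is a placeholder, not a real credential):
#
#   TWITTER_API_KEY=your-twitterapi-io-key
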
def search_tweets(query, query_type="Latest", limit=20):
    """
    Searches for tweets using the twitterapi.io advanced search endpoint.
    Returns the parsed JSON response, or None on any error.
    """
    if not API_KEY:
        print("❌ Error: TWITTER_API_KEY not found or empty")
        return None
    url = "https://api.twitterapi.io/twitter/tweet/advanced_search"
    headers = {"X-API-Key": API_KEY}  # already stripped when loaded above
    params = {"query": query, "queryType": query_type, "limit": limit}
    print(f"🔍 Executing search with query: {query}")
    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"❌ Request error: {e}")
        return None
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

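# A minimal usage sketch for search_tweets; the query string below is a
# hypothetical example, and a live call needs a valid twitterapi.io key:
#
#   result = search_tweets('"flood" "Mumbai" lang:en', query_type="Latest", limit=5)
#   if result is not None:
#       print(len(result.get("tweets", [])), "tweets returned")
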
def extract_tweets(result_json):
    """
    Extracts a normalized list of tweets from the API result.
    Returns a list of dicts with keys: tweet_url, location, created_at, text, hashtags
    """
    if not result_json or 'tweets' not in result_json:
        return []
    tweets = result_json.get('tweets', [])
    extracted_data = []
    for tweet in tweets:
        tweet_url = tweet.get('url')
        text = tweet.get('text')
        created_at = tweet.get('createdAt')
        location = tweet.get('author', {}).get('location', None)
        hashtags = [tag['text'] for tag in tweet.get('entities', {}).get('hashtags', [])]
        extracted_data.append({
            'tweet_url': tweet_url,
            'location': location,
            'created_at': created_at,
            'text': text,
            'hashtags': hashtags
        })
    return extracted_data

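# For reference, each element returned by extract_tweets has this shape
# (the values below are illustrative, not real API output):
#
#   {
#       "tweet_url": "https://x.com/user/status/123",
#       "location": "Mumbai, India",
#       "created_at": "...",
#       "text": "High waves reported near Marine Drive #Mumbai",
#       "hashtags": ["Mumbai"]
#   }
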
def build_custom_query(hazard_type=None, location=None, days_back=1):
    """
    Builds a custom query based on provided hazard type and location.

    Args:
        hazard_type (str): Specific hazard type to search for
        location (str): Specific location to search for
        days_back (int): Number of days back to search (default: 1)

    Returns:
        str: Custom search query
    """
    # Default hazard keywords
    default_hazards = [
        "flood", "tsunami", "cyclone", "storm surge", "high tide", "high waves",
        "swell", "coastal flooding", "rip current", "coastal erosion",
        "water discoloration", "algal bloom", "marine debris", "pollution"
    ]
    # Default location keywords
    default_locations = [
        "Mumbai", "Chennai", "Kolkata", "Odisha", "Kerala", "Gujarat", "Goa",
        "Andhra Pradesh", "West Bengal", "Vizag", "Puri", "Bay of Bengal", "Arabian Sea"
    ]
    # Build hazard query: the provided hazard type, or an OR of the defaults
    if hazard_type:
        hazard_query = f'"{hazard_type}"'
    else:
        hazard_query = "(" + " OR ".join([f'"{hazard}"' for hazard in default_hazards]) + ")"
    # Build location query: the provided location, or an OR of the defaults
    if location:
        location_query = f'"{location}"'
    else:
        location_query = "(" + " OR ".join([f'"{loc}"' for loc in default_locations]) + ")"
    # Language filter: Indian language codes plus English
    allowed_languages = [
        "as", "bn", "brx", "doi", "gu", "hi", "kn", "ks", "kok", "ml", "mni",
        "mr", "ne", "or", "pa", "sa", "sat", "sd", "ta", "te", "ur", "en", "bh"
    ]
    lang_query = "(" + " OR ".join([f"lang:{lang}" for lang in allowed_languages]) + ")"
    # Date filter: tweets posted on or after (today - days_back)
    search_date = date.today() - timedelta(days=days_back)
    date_filter = f"since:{search_date.strftime('%Y-%m-%d')}"
    # Combine all parts
    full_query = f"{hazard_query} {location_query} {lang_query} {date_filter}"
    return full_query

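# For example, the following call produces a query of this shape (the date
# reflects whichever day the function runs; language list abbreviated):
#
#   build_custom_query(hazard_type="cyclone", location="Odisha", days_back=2)
#   # -> '"cyclone" "Odisha" (lang:as OR lang:bn OR ... OR lang:bh) since:2025-09-01'
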
def build_default_query():
    """
    Builds the default hazard + India coastal locations + language + date query.
    """
    return build_custom_query()

def fetch_hazard_tweets(limit=20):
    """
    Fetches tweets matching the default hazard query and returns the extracted list.
    """
    query = build_default_query()
    result = search_tweets(query=query, query_type="Latest", limit=limit)
    return extract_tweets(result)

def fetch_custom_tweets(hazard_type=None, location=None, limit=20, days_back=1):
    """
    Fetches tweets based on custom hazard type and location keywords.

    Args:
        hazard_type (str, optional): Specific hazard type to search for
        location (str, optional): Specific location to search for
        limit (int): Maximum number of tweets to fetch (default: 20)
        days_back (int): Number of days back to search (default: 1)

    Returns:
        list: List of extracted tweets
    """
    query = build_custom_query(hazard_type=hazard_type, location=location, days_back=days_back)
    print(f"🔍 Custom search query: {query}")
    result = search_tweets(query=query, query_type="Latest", limit=limit)
    return extract_tweets(result)

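# A usage sketch for the custom fetch; the hazard and location values here
# are arbitrary examples, not a required vocabulary:
#
#   tweets = fetch_custom_tweets(hazard_type="storm surge", location="Chennai",
#                                limit=10, days_back=3)
#   for t in tweets:
#       print(t["created_at"], t["tweet_url"])
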
def get_available_hazards():
    """
    Returns a list of available hazard types for keyword search.
    """
    return [
        "flood", "tsunami", "cyclone", "storm surge", "high tide", "high waves",
        "swell", "coastal flooding", "rip current", "coastal erosion",
        "water discoloration", "algal bloom", "marine debris", "pollution"
    ]

def get_available_locations():
    """
    Returns a list of available locations for keyword search.
    """
    return [
        "Mumbai", "Chennai", "Kolkata", "Odisha", "Kerala", "Gujarat", "Goa",
        "Andhra Pradesh", "West Bengal", "Vizag", "Puri", "Bay of Bengal", "Arabian Sea",
        "Tamil Nadu", "Maharashtra", "Karnataka", "Andaman", "Nicobar", "Lakshadweep",
        "Kochi", "Cochin", "Mangaluru", "Mangalore", "Chandipur", "Paradip", "Digha", "Gopalpur"
    ]

if __name__ == "__main__":
    tweets = fetch_hazard_tweets(limit=20)
    if tweets:
        print("\nExtracted tweets:")
        print(json.dumps(tweets, indent=2, ensure_ascii=False))
    else:
        print("No tweets found.")