import os
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone
# Prefer supplying the key via the NEWS_API_KEY environment variable;
# the hard-coded literal is kept only as a fallback.
NEWS_API_KEY = os.getenv("NEWS_API_KEY", "6e2fd3190a1a4b7aa695197ea2c4edfd")
TEXT_DATA_PATH = "data/raw/text/"
os.makedirs(TEXT_DATA_PATH, exist_ok=True)
def fetch_news(query="crypto OR bitcoin OR stock", from_days_ago=1):
    end_date = datetime.now(timezone.utc)
    start_date = end_date - timedelta(days=from_days_ago)

    url = "https://newsapi.org/v2/everything"
    params = {
        "q": query,
        "from": start_date.strftime('%Y-%m-%d'),
        "to": end_date.strftime('%Y-%m-%d'),
        "language": "en",
        "sortBy": "publishedAt",
        "apiKey": NEWS_API_KEY,
        "pageSize": 100,
    }

    # Timeout added so a stalled connection cannot hang the collector.
    response = requests.get(url, params=params, timeout=30)
    data = response.json()

    # NewsAPI reports errors as a JSON body without an "articles" key.
    if "articles" not in data:
        raise Exception(f"❌ Error fetching news: {data}")

    articles = data["articles"]
    news_df = pd.DataFrame([{
        "source": a["source"]["name"],
        "title": a["title"],
        "description": a["description"],
        "publishedAt": a["publishedAt"],
        "url": a["url"]
    } for a in articles])

    file_path = os.path.join(TEXT_DATA_PATH, f"news_{start_date.date()}_{end_date.date()}.csv")
    news_df.to_csv(file_path, index=False)
    print(f"✅ News saved to {file_path}")
    return news_df
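
# Hedged sketch (not part of the original script): NewsAPI's /v2/everything
# endpoint also accepts a `page` parameter, so more than the first 100 articles
# can be collected by looping over pages. The function name and `max_pages` cap
# are illustrative; free-tier keys typically reject pages beyond the first.
def fetch_news_paged(query="crypto OR bitcoin OR stock", from_days_ago=1, max_pages=3):
    end_date = datetime.now(timezone.utc)
    start_date = end_date - timedelta(days=from_days_ago)
    frames = []
    for page in range(1, max_pages + 1):
        response = requests.get(
            "https://newsapi.org/v2/everything",
            params={
                "q": query,
                "from": start_date.strftime('%Y-%m-%d'),
                "to": end_date.strftime('%Y-%m-%d'),
                "language": "en",
                "sortBy": "publishedAt",
                "apiKey": NEWS_API_KEY,
                "pageSize": 100,
                "page": page,
            },
            timeout=30,
        )
        articles = response.json().get("articles") or []
        if not articles:
            break  # no more results, or the plan's page limit was hit
        frames.append(pd.DataFrame([{
            "source": a["source"]["name"],
            "title": a["title"],
            "description": a["description"],
            "publishedAt": a["publishedAt"],
            "url": a["url"]
        } for a in articles]))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()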
def fetch_reddit_stub():
    print("Reddit data collection not implemented yet.")
    print("🔧 To implement, use PRAW (Reddit API) or Pushshift.io API.")
    return pd.DataFrame()
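
# Hedged sketch of the PRAW route mentioned in the stub above (not wired into
# __main__). Assumes you have Reddit API credentials; the client_id and
# client_secret values and the default subreddit list are placeholders, not
# values from this project.
def fetch_reddit_sketch(subreddits="CryptoCurrency+stocks", limit=100):
    import praw  # pip install praw

    reddit = praw.Reddit(
        client_id="YOUR_CLIENT_ID",          # placeholder credential
        client_secret="YOUR_CLIENT_SECRET",  # placeholder credential
        user_agent="financial-text-signals",
    )
    # Mirror the news_df schema so downstream code can concatenate sources.
    posts = [{
        "source": f"r/{s.subreddit.display_name}",
        "title": s.title,
        "description": s.selftext,
        "publishedAt": datetime.fromtimestamp(s.created_utc, tz=timezone.utc).isoformat(),
        "url": s.url,
    } for s in reddit.subreddit(subreddits).new(limit=limit)]
    return pd.DataFrame(posts)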
def fetch_tweets_stub():
    print("Twitter/X data collection requires paid API access.")
    print("🔧 Use Tweepy or Twitter API v2 with Bearer Token if available.")
    return pd.DataFrame()
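
# Hedged sketch of the Tweepy route mentioned in the stub above (not wired
# into __main__). Assumes a Twitter API v2 Bearer Token with recent-search
# access; the token value and the function name are placeholders.
def fetch_tweets_sketch(query="(crypto OR bitcoin OR stock) lang:en -is:retweet"):
    import tweepy  # pip install tweepy

    client = tweepy.Client(bearer_token="YOUR_BEARER_TOKEN")  # placeholder
    resp = client.search_recent_tweets(
        query=query,
        max_results=100,
        tweet_fields=["created_at"],
    )
    # resp.data is None when the search returns nothing.
    tweets = [{
        "source": "Twitter/X",
        "title": t.text[:80],
        "description": t.text,
        "publishedAt": t.created_at.isoformat() if t.created_at else None,
        "url": f"https://x.com/i/web/status/{t.id}",
    } for t in (resp.data or [])]
    return pd.DataFrame(tweets)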
if __name__ == "__main__":
    print("📥 Collecting financial text signals...")
    news_df = fetch_news()
    reddit_df = fetch_reddit_stub()
    twitter_df = fetch_tweets_stub()