Spaces:

sklearn-docs
/

Visualizing_the_stock_market_structure

Runtime error

App Files Files Community

Visualizing_the_stock_market_structure / app.py

tushifire

Adding visualization function

12e8f06 over 2 years ago

raw

history blame

6.34 kB

	"""
	Demo is based on https://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html
	"""

	import sys
	import numpy as np
	import pandas as pd

	# Ticker symbol -> company display name for every stock in the demo.
	# The keys are substituted into the quote CSV URL further down in this file;
	# the values are the labels drawn next to the nodes and listed per cluster.
	symbol_dict = {
	"TOT": "Total",
	"XOM": "Exxon",
	"CVX": "Chevron",
	"COP": "ConocoPhillips",
	"VLO": "Valero Energy",
	"MSFT": "Microsoft",
	"IBM": "IBM",
	"TWX": "Time Warner",
	"CMCSA": "Comcast",
	"CVC": "Cablevision",
	"YHOO": "Yahoo",
	"DELL": "Dell",
	"HPQ": "HP",
	"AMZN": "Amazon",
	"TM": "Toyota",
	"CAJ": "Canon",
	"SNE": "Sony",
	"F": "Ford",
	"HMC": "Honda",
	"NAV": "Navistar",
	"NOC": "Northrop Grumman",
	"BA": "Boeing",
	"KO": "Coca Cola",
	"MMM": "3M",
	"MCD": "McDonald's",
	"PEP": "Pepsi",
	"K": "Kellogg",
	"UN": "Unilever",
	"MAR": "Marriott",
	"PG": "Procter Gamble",
	"CL": "Colgate-Palmolive",
	"GE": "General Electrics",
	"WFC": "Wells Fargo",
	"JPM": "JPMorgan Chase",
	"AIG": "AIG",
	"AXP": "American express",
	"BAC": "Bank of America",
	"GS": "Goldman Sachs",
	"AAPL": "Apple",
	"SAP": "SAP",
	"CSCO": "Cisco",
	"TXN": "Texas Instruments",
	"XRX": "Xerox",
	"WMT": "Wal-Mart",
	"HD": "Home Depot",
	"GSK": "GlaxoSmithKline",
	"PFE": "Pfizer",
	"SNY": "Sanofi-Aventis",
	"NVS": "Novartis",
	"KMB": "Kimberly-Clark",
	"R": "Ryder",
	"GD": "General Dynamics",
	"RTN": "Raytheon",
	"CVS": "CVS",
	"CAT": "Caterpillar",
	"DD": "DuPont de Nemours",
	}


	# Split the mapping into two parallel arrays, ordered by ticker symbol.
	symbols, names = np.array(sorted(symbol_dict.items())).T

	# Template for the per-ticker CSV published with the scikit-learn example
	# data; the placeholder is filled with the ticker symbol.
	url = (
	    "https://raw.githubusercontent.com/scikit-learn/examples-data/"
	    "master/financial-data/{}.csv"
	)

	# Download one quote table per ticker, reporting progress on stderr.
	quotes = []
	for symbol in symbols:
	    print("Fetching quote history for %r" % symbol, file=sys.stderr)
	    quotes.append(pd.read_csv(url.format(symbol)))

	# Stack the per-ticker series so that each row belongs to one ticker.
	close_prices = np.vstack([frame["close"] for frame in quotes])
	open_prices = np.vstack([frame["open"] for frame in quotes])

	# The daily variations of the quotes are what carry the most information
	variation = close_prices - open_prices


	from sklearn import covariance

	# Ten log-spaced values from 10**-1.5 to 10**1, handed to GraphicalLassoCV
	# as its `alphas` argument.
	alphas = np.logspace(-1.5, 1, num=10)
	edge_model = covariance.GraphicalLassoCV(alphas=alphas)

	# Standardize the time series: using correlations rather than covariances
	# is more efficient for structure recovery.
	# `variation` has one row per stock, so after the transpose each column of X
	# is one stock and the division scales every column by its own standard deviation.
	X = variation.copy().T
	X /= X.std(axis=0)
	edge_model.fit(X)



	from sklearn import cluster

	# Cluster the stocks with affinity propagation on the fitted covariance matrix.
	# The first return value is discarded; `labels` is later used as a mask over
	# `names`, so it is aligned with the stock order.
	_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
	# NOTE: despite the name this is the largest label value, not a count; the
	# rest of the file uses `n_labels + 1` wherever the number of clusters is needed.
	n_labels = labels.max()


	# Finding a low-dimension embedding for visualization: find the best position of
	# the nodes (the stocks) on a 2D plane

	from sklearn import manifold

	# Locally linear embedding configured for two output components.
	node_position_model = manifold.LocallyLinearEmbedding(
	n_components=2, eigen_solver="dense", n_neighbors=6
	)

	# X.T has one row per stock. The result is transposed back so that
	# embedding[0] and embedding[1] are the x and y coordinates of all stocks.
	embedding = node_position_model.fit_transform(X.T).T

	import matplotlib.pyplot as plt
	from matplotlib.collections import LineCollection

	def visualize_stocks():
	    """Draw the stock graph and return the matplotlib figure.

	    Reads the module-level ``edge_model``, ``embedding``, ``labels``,
	    ``names`` and ``n_labels``. Each stock is drawn as a node at its
	    embedding coordinates and coloured by its cluster label. Two stocks are
	    joined by an edge when the absolute value of their rescaled precision
	    entry exceeds 0.02; edge colour and width grow with that value.

	    Returns:
	        The pyplot figure number 1, which is cleared and redrawn on each call.
	    """
	    fig = plt.figure(1, facecolor="w", figsize=(10, 8))
	    plt.clf()
	    ax = plt.axes([0.0, 0.0, 1.0, 1.0])
	    plt.axis("off")

	    # Rescale the precision matrix by 1/sqrt(diagonal) along both axes so
	    # that its diagonal becomes 1.
	    partial_correlations = edge_model.precision_.copy()
	    d = 1 / np.sqrt(np.diag(partial_correlations))
	    partial_correlations *= d
	    partial_correlations *= d[:, np.newaxis]
	    # Only the strict upper triangle is inspected, so each pair is seen once.
	    non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02

	    # Plot the nodes using the coordinates of the embedding.
	    plt.scatter(
	        embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.nipy_spectral
	    )

	    # Plot the edges. With no edge above the threshold there is nothing to
	    # draw, and `values.max()` would raise on the empty array.
	    values = np.abs(partial_correlations[non_zero])
	    if values.size:
	        start_idx, end_idx = np.where(non_zero)
	        # One segment per edge: [(x_start, y_start), (x_stop, y_stop)].
	        segments = [
	            [embedding[:, start], embedding[:, stop]]
	            for start, stop in zip(start_idx, end_idx)
	        ]
	        lc = LineCollection(
	            segments,
	            zorder=0,
	            cmap=plt.cm.hot_r,
	            norm=plt.Normalize(0, 0.7 * values.max()),
	        )
	        lc.set_array(values)
	        lc.set_linewidths(15 * values)
	        ax.add_collection(lc)

	    # `n_labels` is the largest label value; with a single cluster it is 0,
	    # so fall back to 1.0 to avoid dividing by zero when picking the colour.
	    colour_scale = float(n_labels) or 1.0

	    # Add a label to each node. The challenge here is that we want to
	    # position the labels to avoid overlap with other labels.
	    for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)):
	        # Offsets from this node to every node; the node's own zero offset
	        # is overwritten so argmin does not simply select the node itself.
	        dx = x - embedding[0]
	        dx[index] = 1
	        dy = y - embedding[1]
	        dy[index] = 1
	        # Horizontal offset to the node that is closest vertically, and
	        # vertical offset to the node that is closest horizontally.
	        this_dx = dx[np.argmin(np.abs(dy))]
	        this_dy = dy[np.argmin(np.abs(dx))]

	        # Push the text away from that neighbour on each axis.
	        if this_dx > 0:
	            horizontalalignment = "left"
	            x = x + 0.002
	        else:
	            horizontalalignment = "right"
	            x = x - 0.002
	        if this_dy > 0:
	            verticalalignment = "bottom"
	            y = y + 0.002
	        else:
	            verticalalignment = "top"
	            y = y - 0.002

	        plt.text(
	            x,
	            y,
	            name,
	            size=10,
	            horizontalalignment=horizontalalignment,
	            verticalalignment=verticalalignment,
	            bbox=dict(
	                facecolor="w",
	                edgecolor=plt.cm.nipy_spectral(label / colour_scale),
	                alpha=0.6,
	            ),
	        )

	    # `ndarray.ptp()` no longer exists in NumPy 2.0; the function form
	    # works on both old and new NumPy versions.
	    x_span = np.ptp(embedding[0])
	    y_span = np.ptp(embedding[1])
	    plt.xlim(
	        embedding[0].min() - 0.15 * x_span,
	        embedding[0].max() + 0.10 * x_span,
	    )
	    plt.ylim(
	        embedding[1].min() - 0.03 * y_span,
	        embedding[1].max() + 0.03 * y_span,
	    )

	    return fig

	import gradio as gr

	title = " 📈 Visualizing the stock market structure 📈"

	with gr.Blocks(title=title) as demo:
	    # Page heading followed by the static description lines, in order.
	    gr.Markdown(f"# {title}")
	    for description in (
	        " Data is of 56 stocks between the period of 2003 - 2008 <br>",
	        " Stocks the move in together with each other are grouped together in a cluster <br>",
	        " [Demo is based on sklearn docs](https://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html)",
	    ):
	        gr.Markdown(description)

	    # One line per cluster listing its member companies (numbered from 1).
	    for cluster_index in range(n_labels + 1):
	        members = ", ".join(names[labels == cluster_index])
	        gr.Markdown(f"Cluster {cluster_index + 1}: {members}")

	    # The button redraws the figure into the plot component created right after it.
	    btn = gr.Button(value="Visualize")
	    plot_output = gr.Plot(label='Visualizing stock into clusters')
	    btn.click(visualize_stocks, outputs=plot_output)
	    gr.Markdown("## In progress")

	demo.launch()