Spaces:

Ekimetrics
/

climate-question-answering

Running

timeki committed on Aug 27

Commit

2e56bdb

Merge hf-origin/main into pr/30

Resolved conflicts by keeping pr/30 changes which include:
- Azure AI Search implementation (replacing Pinecone)
- Updated talk_to_data functionality
- New dependencies and vectorstore wrapper
- Enhanced IPCC and DRIAS workflows

New files from main:
- climateqa/engine/talk_to_data/myVanna.py
- climateqa/engine/talk_to_data/plot.py
- climateqa/engine/talk_to_data/sql_query.py
- climateqa/engine/talk_to_data/talk_to_drias.py
- climateqa/engine/talk_to_data/utils.py
- climateqa/engine/talk_to_data/vanna_class.py

Files changed (6) hide show

climateqa/engine/talk_to_data/myVanna.py +13 -0
climateqa/engine/talk_to_data/plot.py +418 -0
climateqa/engine/talk_to_data/sql_query.py +114 -0
climateqa/engine/talk_to_data/talk_to_drias.py +317 -0
climateqa/engine/talk_to_data/utils.py +281 -0
climateqa/engine/talk_to_data/vanna_class.py +325 -0

climateqa/engine/talk_to_data/myVanna.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from dotenv import load_dotenv
+from climateqa.engine.talk_to_data.vanna_class import MyCustomVectorDB
+from vanna.openai import OpenAI_Chat
+import os
+# Load environment variables (python-dotenv) before reading the API key below.
+load_dotenv()
+# NOTE(review): the key is read from the 'THEO_API_KEY' variable and is not referenced
+# again in this module — confirm whether another module imports OPENAI_API_KEY.
+OPENAI_API_KEY = os.getenv('THEO_API_KEY')
+class MyVanna(MyCustomVectorDB, OpenAI_Chat):
+    """Vanna client combining the project's custom vector store with the OpenAI chat backend.
+    Args:
+        config: Configuration object forwarded unchanged to both base initialisers.
+    """
+    def __init__(self, config=None):
+        # Both bases are initialised explicitly (not through super()), in this order,
+        # each receiving the same config.
+        MyCustomVectorDB.__init__(self, config=config)
+        OpenAI_Chat.__init__(self, config=config)

climateqa/engine/talk_to_data/plot.py ADDED Viewed

	@@ -0,0 +1,418 @@

+from typing import Callable, TypedDict
+from matplotlib.figure import figaspect
+import pandas as pd
+from plotly.graph_objects import Figure
+import plotly.graph_objects as go
+import plotly.express as px
+from climateqa.engine.talk_to_data.sql_query import (
+    indicator_for_given_year_query,
+    indicator_per_year_at_location_query,
+)
+from climateqa.engine.talk_to_data.config import INDICATOR_TO_UNIT
+class Plot(TypedDict):
+    """Describes one plot type that the DRIAS workflow can produce.
+    Each entry couples a plot factory with the SQL query builder that fetches its data.
+    Attributes:
+        name (str): The name of the plot type (the workflow looks plots up by this name)
+        description (str): A description of what the plot shows
+        params (list[str]): Names of the parameters the plot needs
+        plot_function (Callable[..., Callable[..., Figure]]): Factory taking the params
+            dictionary and returning a function that turns a DataFrame into a Figure
+        sql_query (Callable[..., str]): Function called with (table, params) that
+            returns the SQL query string for the plot
+    """
+    name: str
+    description: str
+    params: list[str]
+    plot_function: Callable[..., Callable[..., Figure]]
+    sql_query: Callable[..., str]
+def plot_indicator_evolution_at_location(params: dict) -> Callable[..., Figure]:
+    """Generates a function to plot indicator evolution over time at a location.
+    This function creates a line plot showing how a climate indicator changes
+    over time at a specific location. It handles temperature, precipitation,
+    and other climate indicators.
+    Args:
+        params (dict): Dictionary containing:
+            - indicator_column (str): The column name for the indicator
+            - location (str): The location to plot
+            - model (str): The climate model to use
+    Returns:
+        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
+    Example:
+        >>> plot_func = plot_indicator_evolution_at_location({
+        ...     'indicator_column': 'mean_temperature',
+        ...     'location': 'Paris',
+        ...     'model': 'ALL'
+        ... })
+        >>> fig = plot_func(df)
+    """
+    indicator = params["indicator_column"]
+    location = params["location"]
+    indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
+    unit = INDICATOR_TO_UNIT.get(indicator, "")
+    def plot_data(df: pd.DataFrame) -> Figure:
+        """Generates the actual plot from the data.
+        Args:
+            df (pd.DataFrame): DataFrame containing the data to plot
+        Returns:
+            Figure: A plotly Figure object showing the indicator evolution
+        """
+        fig = go.Figure()
+        if df['model'].nunique() != 1:
+            df_avg = df.groupby("year", as_index=False)[indicator].mean()
+            # Transform to list to avoid pandas encoding
+            indicators = df_avg[indicator].astype(float).tolist()
+            years = df_avg["year"].astype(int).tolist()
+            # Compute the 10-year rolling average
+            rolling_window = 10
+            sliding_averages = (
+                df_avg[indicator]
+                .rolling(window=rolling_window, min_periods=rolling_window)
+                .mean()
+                .astype(float)
+                .tolist()
+            )
+            model_label = "Model Average"
+            # Only add rolling average if we have enough data points
+            if len([x for x in sliding_averages if pd.notna(x)]) > 0:
+                # Sliding average dashed line
+                fig.add_scatter(
+                    x=years,
+                    y=sliding_averages,
+                    mode="lines",
+                    name="10 years rolling average",
+                    line=dict(dash="dash"),
+                    marker=dict(color="#d62728"),
+                    hovertemplate=f"10-year average: %{{y:.2f}} {unit}<br>Year: %{{x}}<extra></extra>"
+                )
+        else:
+            df_model = df
+            # Transform to list to avoid pandas encoding
+            indicators = df_model[indicator].astype(float).tolist()
+            years = df_model["year"].astype(int).tolist()
+            # Compute the 10-year rolling average
+            rolling_window = 10
+            sliding_averages = (
+                df_model[indicator]
+                .rolling(window=rolling_window, min_periods=rolling_window)
+                .mean()
+                .astype(float)
+                .tolist()
+            )
+            model_label = f"Model : {df['model'].unique()[0]}"
+            # Only add rolling average if we have enough data points
+            if len([x for x in sliding_averages if pd.notna(x)]) > 0:
+                # Sliding average dashed line
+                fig.add_scatter(
+                    x=years,
+                    y=sliding_averages,
+                    mode="lines",
+                    name="10 years rolling average",
+                    line=dict(dash="dash"),
+                    marker=dict(color="#d62728"),
+                    hovertemplate=f"10-year average: %{{y:.2f}} {unit}<br>Year: %{{x}}<extra></extra>"
+                )
+        # Indicator per year plot
+        fig.add_scatter(
+            x=years,
+            y=indicators,
+            name=f"Yearly {indicator_label}",
+            mode="lines",
+            marker=dict(color="#1f77b4"),
+            hovertemplate=f"{indicator_label}: %{{y:.2f}} {unit}<br>Year: %{{x}}<extra></extra>"
+        )
+        fig.update_layout(
+            title=f"Plot of {indicator_label} in {location} ({model_label})",
+            xaxis_title="Year",
+            yaxis_title=f"{indicator_label} ({unit})",
+            template="plotly_white",
+        )
+        return fig
+    return plot_data
+# Plot registry entry: line plot of an indicator over the years at one location,
+# fed by the per-year-at-location SQL query.
+indicator_evolution_at_location: Plot = {
+    "name": "Indicator evolution at location",
+    "description": "Plot an evolution of the indicator at a certain location",
+    "params": ["indicator_column", "location", "model"],
+    "plot_function": plot_indicator_evolution_at_location,
+    "sql_query": indicator_per_year_at_location_query,
+}
+def plot_indicator_number_of_days_per_year_at_location(
+    params: dict,
+) -> Callable[..., Figure]:
+    """Generates a function to plot the number of days per year for an indicator.
+    This function creates a bar chart showing the frequency of certain climate
+    events (like days above a temperature threshold) per year at a specific location.
+    Args:
+        params (dict): Dictionary containing:
+            - indicator_column (str): The column name for the indicator
+            - location (str): The location to plot
+            - model (str): The climate model to use
+    Returns:
+        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
+    """
+    indicator = params["indicator_column"]
+    location = params["location"]
+    indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
+    unit = INDICATOR_TO_UNIT.get(indicator, "")
+    def plot_data(df: pd.DataFrame) -> Figure:
+        """Generate the figure thanks to the dataframe
+        Args:
+            df (pd.DataFrame): pandas dataframe with the required data
+        Returns:
+            Figure: Plotly figure
+        """
+        fig = go.Figure()
+        if df['model'].nunique() != 1:
+            df_avg = df.groupby("year", as_index=False)[indicator].mean()
+            # Transform to list to avoid pandas encoding
+            indicators = df_avg[indicator].astype(float).tolist()
+            years = df_avg["year"].astype(int).tolist()
+            model_label = "Model Average"
+        else:
+            df_model = df
+            # Transform to list to avoid pandas encoding
+            indicators = df_model[indicator].astype(float).tolist()
+            years = df_model["year"].astype(int).tolist()
+            model_label = f"Model : {df['model'].unique()[0]}"
+        # Bar plot
+        fig.add_trace(
+            go.Bar(
+                x=years,
+                y=indicators,
+                width=0.5,
+                marker=dict(color="#1f77b4"),
+                hovertemplate=f"{indicator_label}: %{{y:.2f}} {unit}<br>Year: %{{x}}<extra></extra>"
+            )
+        )
+        fig.update_layout(
+            title=f"{indicator_label} in {location} ({model_label})",
+            xaxis_title="Year",
+            yaxis_title=f"{indicator_label} ({unit})",
+            yaxis=dict(range=[0, max(indicators)]),
+            bargap=0.5,
+            template="plotly_white",
+        )
+        return fig
+    return plot_data
+# Plot registry entry: bar chart of a frequency indicator per year at one location,
+# fed by the per-year-at-location SQL query.
+indicator_number_of_days_per_year_at_location: Plot = {
+    "name": "Indicator number of days per year at location",
+    "description": "Plot a barchart of the number of days per year of a certain indicator at a certain location. It is appropriate for frequency indicator.",
+    "params": ["indicator_column", "location", "model"],
+    "plot_function": plot_indicator_number_of_days_per_year_at_location,
+    "sql_query": indicator_per_year_at_location_query,
+}
+def plot_distribution_of_indicator_for_given_year(
+    params: dict,
+) -> Callable[..., Figure]:
+    """Generates a function to plot the distribution of an indicator for a year.
+    This function creates a histogram showing the distribution of a climate
+    indicator across different locations for a specific year.
+    Args:
+        params (dict): Dictionary containing:
+            - indicator_column (str): The column name for the indicator
+            - year (str): The year to plot
+            - model (str): The climate model to use
+    Returns:
+        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
+    """
+    indicator = params["indicator_column"]
+    year = params["year"]
+    indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
+    unit = INDICATOR_TO_UNIT.get(indicator, "")
+    def plot_data(df: pd.DataFrame) -> Figure:
+        """Generate the figure thanks to the dataframe
+        Args:
+            df (pd.DataFrame): pandas dataframe with the required data
+        Returns:
+            Figure: Plotly figure
+        """
+        fig = go.Figure()
+        if df['model'].nunique() != 1:
+            df_avg = df.groupby(["latitude", "longitude"], as_index=False)[
+                indicator
+            ].mean()
+            # Transform to list to avoid pandas encoding
+            indicators = df_avg[indicator].astype(float).tolist()
+            model_label = "Model Average"
+        else:
+            df_model = df
+            # Transform to list to avoid pandas encoding
+            indicators = df_model[indicator].astype(float).tolist()
+            model_label = f"Model : {df['model'].unique()[0]}"
+        fig.add_trace(
+            go.Histogram(
+                x=indicators,
+                opacity=0.8,
+                histnorm="percent",
+                marker=dict(color="#1f77b4"),
+                hovertemplate=f"{indicator_label}: %{{x:.2f}} {unit}<br>Frequency: %{{y:.2f}}%<extra></extra>"
+            )
+        )
+        fig.update_layout(
+            title=f"Distribution of {indicator_label} in {year} ({model_label})",
+            xaxis_title=f"{indicator_label} ({unit})",
+            yaxis_title="Frequency (%)",
+            plot_bgcolor="rgba(0, 0, 0, 0)",
+            showlegend=False,
+        )
+        return fig
+    return plot_data
+# Plot registry entry: histogram of an indicator's values for one year,
+# fed by the given-year SQL query.
+distribution_of_indicator_for_given_year: Plot = {
+    "name": "Distribution of an indicator for a given year",
+    "description": "Plot an histogram of the distribution for a given year of the values of an indicator",
+    "params": ["indicator_column", "model", "year"],
+    "plot_function": plot_distribution_of_indicator_for_given_year,
+    "sql_query": indicator_for_given_year_query,
+}
+def plot_map_of_france_of_indicator_for_given_year(
+    params: dict,
+) -> Callable[..., Figure]:
+    """Generates a function to plot a map of France for an indicator.
+    This function creates a choropleth map of France showing the spatial
+    distribution of a climate indicator for a specific year.
+    Args:
+        params (dict): Dictionary containing:
+            - indicator_column (str): The column name for the indicator
+            - year (str): The year to plot
+            - model (str): The climate model to use
+    Returns:
+        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
+    """
+    indicator = params["indicator_column"]
+    year = params["year"]
+    indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
+    unit = INDICATOR_TO_UNIT.get(indicator, "")
+    def plot_data(df: pd.DataFrame) -> Figure:
+        fig = go.Figure()
+        if df['model'].nunique() != 1:
+            df_avg = df.groupby(["latitude", "longitude"], as_index=False)[
+                indicator
+            ].mean()
+            indicators = df_avg[indicator].astype(float).tolist()
+            latitudes = df_avg["latitude"].astype(float).tolist()
+            longitudes = df_avg["longitude"].astype(float).tolist()
+            model_label = "Model Average"
+        else:
+            df_model = df
+            # Transform to list to avoid pandas encoding
+            indicators = df_model[indicator].astype(float).tolist()
+            latitudes = df_model["latitude"].astype(float).tolist()
+            longitudes = df_model["longitude"].astype(float).tolist()
+            model_label = f"Model : {df['model'].unique()[0]}"
+        fig.add_trace(
+            go.Scattermapbox(
+                lat=latitudes,
+                lon=longitudes,
+                mode="markers",
+                marker=dict(
+                    size=10,
+                    color=indicators,  # Color mapped to values
+                    colorscale="Turbo",  # Color scale (can be 'Plasma', 'Jet', etc.)
+                    cmin=min(indicators),  # Minimum color range
+                    cmax=max(indicators),  # Maximum color range
+                    showscale=True,  # Show colorbar
+                ),
+                text=[f"{indicator_label}: {value:.2f} {unit}" for value in indicators],  # Add hover text showing the indicator value
+                hoverinfo="text"  # Only show the custom text on hover
+            )
+        )
+        fig.update_layout(
+            mapbox_style="open-street-map",  # Use OpenStreetMap
+            mapbox_zoom=3,
+            mapbox_center={"lat": 46.6, "lon": 2.0},
+            coloraxis_colorbar=dict(title=f"{indicator_label} ({unit})"),  # Add legend
+            title=f"{indicator_label} in {year} in France ({model_label}) " # Title
+        )
+        return fig
+    return plot_data
+map_of_france_of_indicator_for_given_year: Plot = {
+    "name": "Map of France of an indicator for a given year",
+    "description": "Heatmap on the map of France of the values of an in indicator for a given year",
+    "params": ["indicator_column", "year", "model"],
+    "plot_function": plot_map_of_france_of_indicator_for_given_year,
+    "sql_query": indicator_for_given_year_query,
+}
+# All plot types known to the workflow; entries are looked up by their "name" field.
+PLOTS = [
+    indicator_evolution_at_location,
+    indicator_number_of_days_per_year_at_location,
+    distribution_of_indicator_for_given_year,
+    map_of_france_of_indicator_for_given_year,
+]

climateqa/engine/talk_to_data/sql_query.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from typing import TypedDict
+import duckdb
+import pandas as pd
+async def execute_sql_query(sql_query: str) -> pd.DataFrame:
+    """Executes a SQL query on the DRIAS database and returns the results.
+    This function connects to the DuckDB database containing DRIAS climate data
+    and executes the provided SQL query. It handles the database connection and
+    returns the results as a pandas DataFrame.
+    Args:
+        sql_query (str): The SQL query to execute
+    Returns:
+        pd.DataFrame: A DataFrame containing the query results
+    Raises:
+        duckdb.Error: If there is an error executing the SQL query
+    """
+    def _execute_query():
+        # Execute the query
+        con = duckdb.connect()
+        results = con.sql(sql_query).fetchdf()
+        # return fetched data
+        return results
+    # Run the query in a thread pool to avoid blocking
+    loop = asyncio.get_event_loop()
+    with ThreadPoolExecutor() as executor:
+        return await loop.run_in_executor(executor, _execute_query)
+class IndicatorPerYearAtLocationQueryParams(TypedDict, total=False):
+    """Parameters for querying an indicator's values over time at a location.
+    Declared with total=False, so every key may be absent; the query builder
+    returns an empty query when a key it needs is missing.
+    Attributes:
+        indicator_column (str): The column name for the climate indicator
+        latitude (str): The latitude coordinate of the location
+        longitude (str): The longitude coordinate of the location
+        model (str): The climate model to use (optional)
+    """
+    indicator_column: str
+    latitude: str
+    longitude: str
+    model: str
+def indicator_per_year_at_location_query(
+    table: str, params: IndicatorPerYearAtLocationQueryParams
+) -> str:
+    """SQL Query to get the evolution of an indicator per year at a certain location
+    Args:
+        table (str): sql table of the indicator
+        params (IndicatorPerYearAtLocationQueryParams) : dictionary with the required params for the query
+    Returns:
+        str: the sql query
+    """
+    indicator_column = params.get("indicator_column")
+    latitude = params.get("latitude")
+    longitude = params.get("longitude")
+    if indicator_column is None or latitude is None or longitude is None: # If one parameter is missing, returns an empty query
+        return ""
+    table = f"'hf://datasets/timeki/drias_db/{table.lower()}.parquet'"
+    sql_query = f"SELECT year, {indicator_column}, model\nFROM {table}\nWHERE latitude = {latitude} \nAnd longitude = {longitude} \nOrder by Year"
+    return sql_query
+class IndicatorForGivenYearQueryParams(TypedDict, total=False):
+    """Parameters for querying an indicator's values across locations for a year.
+    Declared with total=False, so every key may be absent; the query builder
+    returns an empty query when a key it needs is missing.
+    Attributes:
+        indicator_column (str): The column name for the climate indicator
+        year (str): The year to query
+        model (str): The climate model to use (optional)
+    """
+    indicator_column: str
+    year: str
+    model: str
+def indicator_for_given_year_query(
+        table:str, params: IndicatorForGivenYearQueryParams
+) -> str:
+    """SQL Query to get the values of an indicator with their latitudes, longitudes and models for a given year
+    Args:
+        table (str): sql table of the indicator
+        params (IndicatorForGivenYearQueryParams): dictionarry with the required params for the query
+    Returns:
+        str: the sql query
+    """
+    indicator_column = params.get("indicator_column")
+    year = params.get('year')
+    if year is None or indicator_column is None: # If one parameter is missing, returns an empty query
+        return ""
+    table = f"'hf://datasets/timeki/drias_db/{table.lower()}.parquet'"
+    sql_query = f"Select {indicator_column}, latitude, longitude, model\nFrom {table}\nWhere year = {year}"
+    return sql_query

climateqa/engine/talk_to_data/talk_to_drias.py ADDED Viewed

	@@ -0,0 +1,317 @@

+import os
+from typing import Any, Callable, TypedDict, Optional
+from numpy import sort
+import pandas as pd
+import asyncio
+from plotly.graph_objects import Figure
+from climateqa.engine.llm import get_llm
+from climateqa.engine.talk_to_data import sql_query
+from climateqa.engine.talk_to_data.config import INDICATOR_COLUMNS_PER_TABLE
+from climateqa.engine.talk_to_data.plot import PLOTS, Plot
+from climateqa.engine.talk_to_data.sql_query import execute_sql_query
+from climateqa.engine.talk_to_data.utils import (
+    detect_relevant_plots,
+    detect_year_with_openai,
+    loc2coords,
+    detect_location_with_openai,
+    nearestNeighbourSQL,
+    detect_relevant_tables,
+)
+# Directory two levels above the current working directory, so the value depends
+# on where the process is launched from.
+ROOT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
+class TableState(TypedDict):
+    """Represents the state of a table in the DRIAS workflow.
+    Tracks everything produced while processing one table: the parameters used,
+    the SQL query, the fetched data and the plotting function.
+    Attributes:
+        table_name (str): The name of the table in the database
+        params (dict[str, Any]): Parameters used for querying the table
+        sql_query (str, optional): The SQL query used to fetch data (None until built)
+        dataframe (pd.DataFrame | None, optional): The resulting data (None until fetched)
+        figure (Callable[..., Figure], optional): Function that builds the figure from a DataFrame
+        status (str): The current status of the table processing ('OK' or 'ERROR')
+    """
+    table_name: str
+    params: dict[str, Any]
+    sql_query: Optional[str]
+    dataframe: Optional[pd.DataFrame | None]
+    figure: Optional[Callable[..., Figure]]
+    status: str
+class PlotState(TypedDict):
+    """Represents the state of a plot in the DRIAS workflow.
+    Groups the tables selected for one plot with the per-table processing results.
+    Attributes:
+        plot_name (str): The name of the plot
+        tables (list[str]): List of tables selected for the plot
+        table_states (dict[str, TableState]): Processing state of each table, keyed by table name
+    """
+    plot_name: str
+    tables: list[str]
+    table_states: dict[str, TableState]
+class State(TypedDict):
+    """Overall state of one Talk To Drias run.
+    Attributes:
+        user_input (str): The question asked by the user
+        plots (list[str]): Names of the plots selected for the question
+        plot_states (dict[str, PlotState]): State of each processed plot, keyed by plot name
+        error (str, optional): Error message for the user; the workflow initialises it to ''
+    """
+    user_input: str
+    plots: list[str]
+    plot_states: dict[str, PlotState]
+    error: Optional[str]
+async def find_relevant_plots(state: State, llm) -> list[str]:
+    print("---- Find relevant plots ----")
+    relevant_plots = await detect_relevant_plots(state['user_input'], llm)
+    return relevant_plots
+async def find_relevant_tables_per_plot(state: State, plot: Plot, llm) -> list[str]:
+    print(f"---- Find relevant tables for {plot['name']} ----")
+    relevant_tables = await detect_relevant_tables(state['user_input'], plot, llm)
+    return relevant_tables
+async def find_param(state: State, param_name:str, table: str) -> dict[str, Any] | None:
+    """Perform the good method to retrieve the desired parameter
+    Args:
+        state (State): state of the workflow
+        param_name (str): name of the desired parameter
+        table (str): name of the table
+    Returns:
+        dict[str, Any] | None:
+    """
+    if param_name == 'location':
+        location = await find_location(state['user_input'], table)
+        return location
+    if param_name == 'year':
+        year = await find_year(state['user_input'])
+        return {'year': year}
+    return None
+class Location(TypedDict):
+    """Location detected in the user's question.
+    Attributes:
+        location (str): The detected location name
+        latitude (str, optional): Latitude of the matched point in the table
+        longitude (str, optional): Longitude of the matched point in the table
+    """
+    location: str
+    latitude: Optional[str]
+    longitude: Optional[str]
+async def find_location(user_input: str, table: str) -> Location:
+    print(f"---- Find location in table {table} ----")
+    location = await detect_location_with_openai(user_input)
+    output: Location = {'location' : location}
+    if location:
+        coords = loc2coords(location)
+        neighbour = nearestNeighbourSQL(coords, table)
+        output.update({
+            "latitude": neighbour[0],
+            "longitude": neighbour[1],
+        })
+    return output
+async def find_year(user_input: str) -> str:
+    """Extracts year information from user input using LLM.
+    This function uses an LLM to identify and extract year information from the
+    user's query, which is used to filter data in subsequent queries.
+    Args:
+        user_input (str): The user's query text
+    Returns:
+        str: The extracted year, or empty string if no year found
+    """
+    print(f"---- Find year ---")
+    year = await detect_year_with_openai(user_input)
+    return year
+def find_indicator_column(table: str) -> str:
+    """Retrieves the name of the indicator column within a table.
+    This function maps table names to their corresponding indicator columns
+    using the predefined mapping in INDICATOR_COLUMNS_PER_TABLE.
+    Args:
+        table (str): Name of the table in the database
+    Returns:
+        str: Name of the indicator column for the specified table
+    Raises:
+        KeyError: If the table name is not found in the mapping
+    """
+    print(f"---- Find indicator column in table {table} ----")
+    return INDICATOR_COLUMNS_PER_TABLE[table]
+async def process_table(
+    table: str,
+    params: dict[str, Any],
+    plot: Plot,
+) -> TableState:
+    """Processes a table to extract relevant data and generate visualizations.
+    This function retrieves the SQL query for the specified table, executes it,
+    and generates a visualization based on the results.
+    Args:
+        table (str): The name of the table to process
+        params (dict[str, Any]): Parameters used for querying the table
+        plot (Plot): The plot object containing SQL query and visualization function
+    Returns:
+        TableState: The state of the processed table
+    """
+    table_state: TableState = {
+        'table_name': table,
+        'params': params.copy(),
+        'status': 'OK',
+        'dataframe': None,
+        'sql_query': None,
+        'figure': None
+    }
+    table_state['params']['indicator_column'] = find_indicator_column(table)
+    sql_query = plot['sql_query'](table, table_state['params'])
+    if sql_query == "":
+        table_state['status'] = 'ERROR'
+        return table_state
+    table_state['sql_query'] = sql_query
+    df = await execute_sql_query(sql_query)
+    table_state['dataframe'] = df
+    table_state['figure'] = plot['plot_function'](table_state['params'])
+    return table_state
+async def drias_workflow(user_input: str) -> State:
+    """Performs the complete workflow of Talk To Drias : from user input to sql queries, dataframes and figures generated
+    Args:
+        user_input (str): initial user input
+    Returns:
+        State: Final state with all the results
+    """
+    state: State = {
+        'user_input': user_input,
+        'plots': [],
+        'plot_states': {},
+        'error': ''
+    }
+    llm = get_llm(provider="openai")
+    plots = await find_relevant_plots(state, llm)
+    state['plots'] = plots
+    if len(state['plots']) < 1:
+        state['error'] = 'There is no plot to answer to the question'
+        return state
+    have_relevant_table = False
+    have_sql_query = False
+    have_dataframe = False
+    for plot_name in state['plots']:
+        plot = next((p for p in PLOTS if p['name'] == plot_name), None) # Find the associated plot object
+        if plot is None:
+            continue
+        plot_state: PlotState = {
+            'plot_name': plot_name,
+            'tables': [],
+            'table_states': {}
+        }
+        plot_state['plot_name'] = plot_name
+        relevant_tables = await find_relevant_tables_per_plot(state, plot, llm)
+        if len(relevant_tables) > 0 :
+            have_relevant_table = True
+        plot_state['tables'] = relevant_tables
+        params = {}
+        for param_name in plot['params']:
+            param = await find_param(state, param_name, relevant_tables[0])
+            if param:
+                params.update(param)
+        tasks = [process_table(table, params, plot) for table in plot_state['tables'][:3]]
+        results = await asyncio.gather(*tasks)
+        # Store results back in plot_state
+        have_dataframe = False
+        have_sql_query = False
+        for table_state in results:
+            if table_state['sql_query']:
+                have_sql_query = True
+            if table_state['dataframe'] is not None and len(table_state['dataframe']) > 0:
+                have_dataframe = True
+            plot_state['table_states'][table_state['table_name']] = table_state
+        state['plot_states'][plot_name] = plot_state
+    if not have_relevant_table:
+        state['error'] = "There is no relevant table in our database to answer your question"
+    elif not have_sql_query:
+        state['error'] = "There is no relevant sql query on our database that can help to answer your question"
+    elif not have_dataframe:
+        state['error'] = "There is no data in our table that can answer to your question"
+    return state
+# def make_write_query_node():
+#     def write_query(state):
+#         print("---- Write query ----")
+#         for table in state["tables"]:
+#             sql_query = QUERIES[state[table]['query_type']](
+#                 table=table,
+#                 indicator_column=state[table]["columns"],
+#                 longitude=state[table]["longitude"],
+#                 latitude=state[table]["latitude"],
+#             )
+#             state[table].update({"sql_query": sql_query})
+#         return state
+#     return write_query
+# def make_fetch_data_node(db_path):
+#     def fetch_data(state):
+#         print("---- Fetch data ----")
+#         for table in state["tables"]:
+#             results = execute_sql_query(db_path, state[table]['sql_query'])
+#             state[table].update(results)
+#         return state
+#     return fetch_data
+## V2
+# def make_fetch_data_node(db_path: str, llm):
+#     def fetch_data(state):
+#         print("---- Fetch data ----")
+#         db = SQLDatabase.from_uri(f"sqlite:///{db_path}")
+#         output = {}
+#         sql_query = write_sql_query(state["query"], db, state["tables"], llm)
+#         # TO DO : Add query checker
+#         print(f"SQL query  : {sql_query}")
+#         output["sql_query"] = sql_query
+#         output.update(fetch_data_from_sql_query(db_path, sql_query))
+#         return output
+#     return fetch_data

climateqa/engine/talk_to_data/utils.py ADDED Viewed

	@@ -0,0 +1,281 @@

+import re
+from typing import Annotated, TypedDict
+import duckdb
+from geopy.geocoders import Nominatim
+import ast
+from climateqa.engine.llm import get_llm
+from climateqa.engine.talk_to_data.config import DRIAS_TABLES
+from climateqa.engine.talk_to_data.plot import PLOTS, Plot
+from langchain_core.prompts import ChatPromptTemplate
async def detect_location_with_openai(sentence):
    """Return the first location an LLM finds in ``sentence``.

    The model returned by ``get_llm()`` is asked to list every location
    mentioned in the sentence as a Python list literal. The bracketed list
    found in the reply is parsed with ``ast.literal_eval``.

    Args:
        sentence: Free-text user input to scan for locations.

    Returns:
        The first element of the parsed list, or ``""`` when the list is
        empty, the reply contains no list, or the list cannot be parsed.
    """
    llm = get_llm()
    prompt = f"""
    Extract all locations (cities, countries, states, or geographical areas) mentioned in the following sentence.
    Return the result as a Python list. If no locations are mentioned, return an empty list.
    Sentence: "{sentence}"
    """
    response = await llm.ainvoke(prompt)
    # str.strip("```python\n") removes a *set of characters* from both ends,
    # not a code-fence prefix, so it only works by accident. Locate the list
    # literal itself instead (from the first "[" to the last "]").
    list_literal = re.search(r"\[.*\]", response.content, re.DOTALL)
    if list_literal is None:
        return ""
    try:
        location_list = ast.literal_eval(list_literal.group(0))
    except (ValueError, SyntaxError, TypeError):
        # The model did not answer with a valid Python literal.
        return ""
    # Guard against a non-list literal (e.g. a bare string, whose [0] would
    # be its first character).
    if isinstance(location_list, (list, tuple)) and location_list:
        return location_list[0]
    return ""
class ArrayOutput(TypedDict):
    """Represents the output of a function that returns an array.
    This class is used to type-hint functions that return arrays,
    ensuring consistent return types across the codebase.
    Attributes:
        array (str): A syntactically valid Python array string
    """
    # Used as the schema for llm.with_structured_output() in
    # detect_year_with_openai: the model fills `array` with text, which that
    # caller then parses into a Python object.
    # NOTE(review): presumably the docstring above and the Annotated
    # description below are forwarded to the model as schema text -- confirm
    # before rewording either of them.
    array: Annotated[str, "Syntactically valid python array."]
async def detect_year_with_openai(sentence: str) -> str:
    """Return the first year an LLM finds in ``sentence``.

    The model returned by ``get_llm()`` is bound to the ``ArrayOutput``
    structured-output schema and asked to list every year mentioned in the
    sentence. The ``array`` field of the reply is a string holding a Python
    list literal, which is parsed here.

    Args:
        sentence (str): Free-text user input to scan for years.

    Returns:
        The first element of the parsed list (its type is whatever literal
        the model wrote, e.g. ``2050`` or ``"2050"``), or ``""`` when the
        list is empty or the reply cannot be parsed as a list.
    """
    llm = get_llm()
    prompt = """
    Extract all years mentioned in the following sentence.
    Return the result as a Python list. If no year are mentioned, return an empty list.
    Sentence: "{sentence}"
    """
    prompt = ChatPromptTemplate.from_template(prompt)
    structured_llm = llm.with_structured_output(ArrayOutput)
    chain = prompt | structured_llm
    response: ArrayOutput = await chain.ainvoke({"sentence": sentence})
    # SECURITY: this used to call eval() on model-generated text, which can
    # execute arbitrary code if the reply is steered by the user's sentence.
    # ast.literal_eval only accepts plain literals, which is all a list of
    # years needs, and matches how the other detectors in this module parse
    # model output.
    try:
        years_list = ast.literal_eval(response['array'])
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal: treat it as "no year detected".
        return ""
    if isinstance(years_list, (list, tuple)) and years_list:
        return years_list[0]
    return ""
def detectTable(sql_query: str) -> list[str]:
    """List the table references that follow each ``FROM`` keyword.

    The match is case-insensitive. A reference is a bare word or a
    backtick/double/single-quoted name, optionally followed by further
    dot-separated parts (``schema.table``). Quotes are kept in the result,
    and names introduced by ``JOIN`` are not captured.

    Args:
        sql_query (str): The SQL text to scan.

    Returns:
        list[str]: One entry per ``FROM`` occurrence, in order of appearance;
        empty when the query has no ``FROM``.

    Example:
        >>> detectTable("SELECT * FROM temperature_data WHERE year > 2000")
        ['temperature_data']
    """
    from_clause = re.compile(
        r'(?i)\bFROM\s+((?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+)(?:\.(?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+))*)'
    )
    return [hit.group(1) for hit in from_clause.finditer(sql_query)]
def loc2coords(location: str) -> tuple[float, float]:
    """Geocode a place name into a ``(latitude, longitude)`` pair.

    Args:
        location (str): Place name handed to Nominatim's ``geocode``.

    Returns:
        tuple[float, float]: The ``latitude`` and ``longitude`` attributes of
        the geocoding result.

    Raises:
        AttributeError: If ``geocode`` returns ``None`` (presumably because
            the place could not be resolved).
    """
    match = Nominatim(user_agent="city_to_latlong").geocode(location)
    return match.latitude, match.longitude
def coords2loc(coords: tuple[float, float]) -> str:
    """Reverse-geocode a coordinate pair into an address string.

    Args:
        coords (tuple[float, float]): Coordinates passed unchanged to
            Nominatim's ``reverse`` (the callers in this module use
            ``(latitude, longitude)`` order).

    Returns:
        str: The ``address`` of the reverse-geocoding result, or
        ``"Unknown Location"`` if the lookup raises or yields no result
        (the error is printed, not re-raised).
    """
    geocoder = Nominatim(user_agent="coords_to_city")
    try:
        # A None result also lands in the except branch via AttributeError.
        address = geocoder.reverse(coords).address
    except Exception as e:
        print(f"Error: {e}")
        return "Unknown Location"
    return address
def nearestNeighbourSQL(location: tuple, table: str) -> tuple[str, str]:
    """Find the grid point of a DRIAS table that is closest to ``location``.

    The table is read from its parquet file on the Hugging Face hub. Only
    points within +/-0.3 degrees of the (rounded) location are considered,
    and the one with the smallest squared difference in plain
    latitude/longitude degrees is returned.

    Args:
        location (tuple): ``(latitude, longitude)`` of the point of interest.
        table (str): Table name; lower-cased to build the parquet file name.

    Returns:
        A ``(latitude, longitude)`` pair taken from the table, or
        ``("", "")`` when no grid point lies inside the search box.
        NOTE(review): the values come straight from the dataframe, so they
        are presumably numeric despite the ``tuple[str, str]`` annotation.
    """
    lat = round(location[0], 3)
    long = round(location[1], 3)
    table = f"'hf://datasets/timeki/drias_db/{table.lower()}.parquet'"
    # Previously the first row of the bounding box was returned, which is an
    # arbitrary point rather than the nearest one. Ordering by squared
    # distance and keeping a single row fixes that, and avoids fetching
    # every row of the box. The extra parentheses keep negative coordinates
    # from producing a "--" SQL comment.
    results = duckdb.sql(
        f"SELECT latitude, longitude FROM {table} "
        f"WHERE latitude BETWEEN {lat - 0.3} AND {lat + 0.3} "
        f"AND longitude BETWEEN {long - 0.3} AND {long + 0.3} "
        f"ORDER BY (latitude - ({lat})) * (latitude - ({lat})) "
        f"+ (longitude - ({long})) * (longitude - ({long})) "
        f"LIMIT 1"
    ).fetchdf()
    if len(results) == 0:
        return "", ""
    return results['latitude'].iloc[0], results['longitude'].iloc[0]
async def detect_relevant_tables(user_question: str, plot: Plot, llm) -> list[str]:
    """Ask the LLM which DRIAS tables suit a given plot and question.

    The prompt combines the plot's ``description``, the full ``DRIAS_TABLES``
    list and the user question, and asks for the three most relevant tables
    as a Python list. The reply is parsed with ``ast.literal_eval``.

    Args:
        user_question (str): The user's question about climate data.
        plot (Plot): Plot configuration; only ``plot['description']`` is read.
        llm: Chat model exposing an async ``ainvoke`` whose result has a
            ``content`` string.

    Returns:
        list[str]: Whatever list literal the model answered with.

    Raises:
        ValueError, SyntaxError: If the reply is not a valid Python literal.
    """
    prompt_parts = [
        f"You are helping to build a plot following this description : {plot['description']}.",
        "You are given a list of tables and a user question.",
        "Based on the description of the plot, which table are appropriate for that kind of plot.",
        "Write the 3 most relevant tables to use. Answer only a python list of table name.",
        f"### List of tables : {DRIAS_TABLES}",
        f"### User question : {user_question}",
        "### List of table name : ",
    ]
    reply = await llm.ainvoke("".join(prompt_parts))
    # Trim code-fence characters and whitespace around the list literal.
    list_source = reply.content.strip("```python\n").strip()
    return ast.literal_eval(list_source)
def replace_coordonates(coords, query, coords_tables):
    """Swap the user's coordinates in ``query`` for per-table coordinates.

    For every occurrence of ``str(coords[0])`` found in the original query,
    one pass replaces the first remaining ``str(coords[0])`` and the first
    remaining ``str(coords[1])`` with the matching entry of ``coords_tables``.

    Args:
        coords: ``(lat, lon)`` pair as it appears in the query text.
        query: SQL text to rewrite.
        coords_tables: Sequence of ``(lat, lon)`` replacements, consumed in
            order; it needs at least as many entries as there are
            occurrences of the latitude.

    Returns:
        The rewritten query string.
    """
    old_lat = str(coords[0])
    for pass_index in range(query.count(old_lat)):
        new_pair = coords_tables[pass_index]
        query = query.replace(old_lat, str(new_pair[0]), 1)
        query = query.replace(str(coords[1]), str(new_pair[1]), 1)
    return query
async def detect_relevant_plots(user_question: str, llm):
    """Ask the LLM which of the known plots can answer the user's question.

    Every entry of ``PLOTS`` is summarised as a ``Name - Description`` line,
    and the model is asked to answer with a Python list of plot names. The
    reply is parsed with ``ast.literal_eval``.

    Args:
        user_question (str): The user's question.
        llm: Chat model exposing an async ``ainvoke`` whose result has a
            ``content`` string.

    Returns:
        The list literal the model answered with (expected: plot names).

    Raises:
        ValueError, SyntaxError: If the reply is not a valid Python literal.
    """
    plots_description = "".join(
        f"Name: {plot['name']} - Description: {plot['description']}\n"
        for plot in PLOTS
    )
    # The instructions used to ask for "tables" (copy-pasted from the table
    # detector) and ran the sentences together without separators.
    prompt = (
        "You are helping to answer a question with insightful visualizations. "
        "You are given a user question and a list of plots with their name and description. "
        "Based on the descriptions of the plots, which plots are appropriate to answer this question? "
        "Write the most relevant plots to use. Answer only a python list of plot names.\n"
        f"### Descriptions of the plots :\n{plots_description}"
        f"### User question : {user_question}\n"
        "### Name of the plot : "
    )
    plot_names = ast.literal_eval(
        (await llm.ainvoke(prompt)).content.strip("```python\n").strip()
    )
    return plot_names
+# Next Version
+# class QueryOutput(TypedDict):
+#     """Generated SQL query."""
+#     query: Annotated[str, ..., "Syntactically valid SQL query."]
+# class PlotlyCodeOutput(TypedDict):
+#     """Generated Plotly code"""
+#     code: Annotated[str, ..., "Synatically valid Plotly python code."]
+# def write_sql_query(user_input: str, db: SQLDatabase, relevant_tables: list[str], llm):
+#     """Generate SQL query to fetch information."""
+#     prompt_params = {
+#         "dialect": db.dialect,
+#         "table_info": db.get_table_info(),
+#         "input": user_input,
+#         "relevant_tables": relevant_tables,
+#         "model": "ALADIN63_CNRM-CM5",
+#     }
+#     prompt = ChatPromptTemplate.from_template(query_prompt_template)
+#     structured_llm = llm.with_structured_output(QueryOutput)
+#     chain = prompt | structured_llm
+#     result = chain.invoke(prompt_params)
+#     return result["query"]
+# def fetch_data_from_sql_query(db: str, sql_query: str):
+#     conn = sqlite3.connect(db)
+#     cursor = conn.cursor()
+#     cursor.execute(sql_query)
+#     column_names = [desc[0] for desc in cursor.description]
+#     values = cursor.fetchall()
+#     return {"column_names": column_names, "data": values}
+# def generate_chart_code(user_input: str, sql_query: list[str], llm):
+#     """ "Generate plotly python code for the chart based on the sql query and the user question"""
+#     class PlotlyCodeOutput(TypedDict):
+#         """Generated Plotly code"""
+#         code: Annotated[str, ..., "Synatically valid Plotly python code."]
+#     prompt = ChatPromptTemplate.from_template(plot_prompt_template)
+#     structured_llm = llm.with_structured_output(PlotlyCodeOutput)
+#     chain = prompt | structured_llm
+#     result = chain.invoke({"input": user_input, "sql_query": sql_query})
+#     return result["code"]

climateqa/engine/talk_to_data/vanna_class.py ADDED Viewed

	@@ -0,0 +1,325 @@

+from vanna.base import VannaBase
+from pinecone import Pinecone
+from climateqa.engine.embeddings import get_embeddings_function
+import pandas as pd
+import hashlib
class MyCustomVectorDB(VannaBase):
    """Vanna vector store backed by a Pinecone index.

    DDL statements, documentation snippets and question/SQL pairs are stored
    in three namespaces of one index: ``ddl``, ``documentation`` and
    ``question_sql``.

    Args:
        config (dict): Configuration dictionary:
            - pc_api_key (str): Pinecone API key
            - index_name (str): Pinecone index name
            - top_k (int): Number of results returned by the retrieval
              methods (default = 2)
    """

    # Suffix appended to an id by the add_* methods -> namespace the vector
    # was written to. Used to route deletions.
    _NAMESPACE_BY_ID_SUFFIX = {
        "_ddl": "ddl",
        "_doc": "documentation",
        "_sql": "question_sql",
    }

    def __init__(self, config):
        """Connect to the Pinecone index described by ``config``.

        Raises:
            Exception: If ``config`` is not dict-like (e.g. ``None``).
        """
        super().__init__(config=config)
        try:
            self.api_key = config.get('pc_api_key')
            self.index_name = config.get('index_name')
        except AttributeError as err:
            # config is None or has no .get(); keep the original message and
            # exception type, but no longer swallow unrelated errors.
            raise Exception("Please provide the Pinecone API key and the index name") from err
        self.pc = Pinecone(api_key=self.api_key)
        self.index = self.pc.Index(self.index_name)
        self.top_k = config.get('top_k', 2)
        self.embeddings = get_embeddings_function()

    def check_embedding(self, id, namespace):
        """Return True if a vector with this id already exists in ``namespace``."""
        fetched = self.index.fetch(ids=[id], namespace=namespace)
        return fetched['vectors'] != {}

    def generate_hash_id(self, data: str) -> str:
        """Generate a deterministic hash ID for the given data.

        Args:
            data (str): The input text to hash.

        Returns:
            str: The SHA-256 digest of the UTF-8 encoded text, as a
            hexadecimal string.
        """
        return hashlib.sha256(data.encode('utf-8')).hexdigest()

    def add_ddl(self, ddl: str, **kwargs) -> str:
        """Store a DDL statement in the ``ddl`` namespace and return its id.

        The id is the hash of the statement plus ``_ddl``; an existing id is
        reported and returned without writing again.
        """
        vector_id = self.generate_hash_id(ddl) + '_ddl'
        if self.check_embedding(vector_id, 'ddl'):
            print(f"DDL having id {vector_id} already exists")
            return vector_id
        self.index.upsert(
            vectors=[(vector_id, self.embeddings.embed_query(ddl), {'ddl': ddl})],
            namespace='ddl',
        )
        return vector_id

    def add_documentation(self, doc: str, **kwargs) -> str:
        """Store a documentation snippet in the ``documentation`` namespace.

        The id is the hash of the text plus ``_doc``; an existing id is
        reported and returned without writing again.
        """
        vector_id = self.generate_hash_id(doc) + '_doc'
        if self.check_embedding(vector_id, 'documentation'):
            print(f"Documentation having id {vector_id} already exists")
            return vector_id
        self.index.upsert(
            vectors=[(vector_id, self.embeddings.embed_query(doc), {'doc': doc})],
            namespace='documentation',
        )
        return vector_id

    def add_question_sql(self, question: str, sql: str, **kwargs) -> str:
        """Store a question/SQL pair in the ``question_sql`` namespace.

        The id is derived from the question alone (plus ``_sql``), while the
        embedding covers the question concatenated with the SQL.
        """
        vector_id = self.generate_hash_id(question) + '_sql'
        if self.check_embedding(vector_id, 'question_sql'):
            print(f"Question-SQL pair having id {vector_id} already exists")
            return vector_id
        self.index.upsert(
            vectors=[(
                vector_id,
                self.embeddings.embed_query(question + sql),
                {'question': question, 'sql': sql},
            )],
            namespace='question_sql',
        )
        return vector_id

    def get_related_ddl(self, question: str, **kwargs) -> list:
        """Return the DDL statements of the ``top_k`` closest ``ddl`` vectors."""
        res = self.index.query(
            vector=self.embeddings.embed_query(question),
            top_k=self.top_k,
            namespace='ddl',
            include_metadata=True,
        )
        return [match['metadata']['ddl'] for match in res['matches']]

    def get_related_documentation(self, question: str, **kwargs) -> list:
        """Return the texts of the ``top_k`` closest ``documentation`` vectors."""
        res = self.index.query(
            vector=self.embeddings.embed_query(question),
            top_k=self.top_k,
            namespace='documentation',
            include_metadata=True,
        )
        return [match['metadata']['doc'] for match in res['matches']]

    def get_similar_question_sql(self, question: str, **kwargs) -> list:
        """Return ``(question, sql)`` tuples for the ``top_k`` closest pairs."""
        res = self.index.query(
            vector=self.embeddings.embed_query(question),
            top_k=self.top_k,
            namespace='question_sql',
            include_metadata=True,
        )
        return [
            (match['metadata']['question'], match['metadata']['sql'])
            for match in res['matches']
        ]

    def get_training_data(self, **kwargs) -> pd.DataFrame:
        """Collect the metadata of stored vectors from all three namespaces.

        Returns:
            pd.DataFrame: One row per returned match, built from its metadata.
        """
        list_of_data = []
        for namespace in ('ddl', 'documentation', 'question_sql'):
            # NOTE(review): no query vector or id is supplied here -- confirm
            # that the Pinecone client accepts such a query.
            data = self.index.query(
                top_k=10000,
                namespace=namespace,
                include_metadata=True,
                include_values=False,
            )
            list_of_data.extend(match['metadata'] for match in data['matches'])
        return pd.DataFrame(list_of_data)

    def remove_training_data(self, id: str, **kwargs) -> bool:
        """Delete one stored item, routing on the suffix of its id.

        Args:
            id (str): Id returned by one of the ``add_*`` methods.

        Returns:
            bool: True if the suffix was recognised and a delete was issued,
            False otherwise.
        """
        # Previously two branches called the non-existent ``self.Index`` and
        # all three targeted namespaces ("_ddl", "_sql", "_doc") that nothing
        # is ever written to.
        for suffix, namespace in self._NAMESPACE_BY_ID_SUFFIX.items():
            if id.endswith(suffix):
                self.index.delete(ids=[id], namespace=namespace)
                return True
        return False

    def generate_embedding(self, text, **kwargs):
        """Embed ``text`` with the same embedding function used for storage."""
        return self.embeddings.embed_query(text)

    def get_sql_prompt(
        self,
        initial_prompt: str,
        question: str,
        question_sql_list: list,
        ddl_list: list,
        doc_list: list,
        **kwargs,
    ):
        """
        Example:
        ```python
        vn.get_sql_prompt(
            question="What are the top 10 customers by sales?",
            question_sql_list=[{"question": "What are the top 10 customers by sales?", "sql": "SELECT * FROM customers ORDER BY sales DESC LIMIT 10"}],
            ddl_list=["CREATE TABLE customers (id INT, name TEXT, sales DECIMAL)"],
            doc_list=["The customers table contains information about customers and their sales."],
        )
        ```
        This method is used to generate a prompt for the LLM to generate SQL.
        Args:
            initial_prompt (str): Opening of the system prompt; a default is
                built from ``self.dialect`` when None.
            question (str): The question to generate SQL for.
            question_sql_list (list): Example questions with their SQL, as
                dicts with "question"/"sql" keys or as (question, sql) pairs.
            ddl_list (list): A list of DDL statements.
            doc_list (list): A list of documentation. Not modified.
        Returns:
            any: The prompt for the LLM to generate SQL (a list of messages).
        """
        if initial_prompt is None:
            initial_prompt = f"You are a {self.dialect} expert. " + \
                "Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. "
        initial_prompt = self.add_ddl_to_prompt(
            initial_prompt, ddl_list, max_tokens=self.max_tokens
        )
        if self.static_documentation != "":
            # Build a new list instead of appending, so the caller's list is
            # not mutated.
            doc_list = [*doc_list, self.static_documentation]
        initial_prompt = self.add_documentation_to_prompt(
            initial_prompt, doc_list, max_tokens=self.max_tokens
        )
        # Example question/SQL pairs are not written into the system prompt;
        # they are replayed below as user/assistant turns.
        initial_prompt += (
            "===Response Guidelines \n"
            "1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
            "2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
            "3. If the provided context is insufficient, please give a sql query based on your knowledge and the context provided. \n"
            "4. Please use the most relevant table(s). \n"
            "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
            f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
            "7. Add a description of the table in the result of the sql query, if relevant. \n"
            "8 Make sure to include the relevant KPI in the SQL query. The query should return impactfull data \n"
        )
        message_log = [self.system_message(initial_prompt)]
        for example in question_sql_list:
            if example is None:
                print("example is None")
                continue
            if isinstance(example, (tuple, list)) and len(example) == 2:
                # get_similar_question_sql() yields (question, sql) pairs;
                # without this they failed the key checks below and were
                # silently dropped from the prompt.
                example = {"question": example[0], "sql": example[1]}
            if "question" in example and "sql" in example:
                message_log.append(self.user_message(example["question"]))
                message_log.append(self.assistant_message(example["sql"]))
        message_log.append(self.user_message(question))
        return message_log
+# def get_sql_prompt(
+#         self,
+#         initial_prompt : str,
+#         question: str,
+#         question_sql_list: list,
+#         ddl_list: list,
+#         doc_list: list,
+#         **kwargs,
+#     ):
+#         """
+#         Example:
+#         ```python
+#         vn.get_sql_prompt(
+#             question="What are the top 10 customers by sales?",
+#             question_sql_list=[{"question": "What are the top 10 customers by sales?", "sql": "SELECT * FROM customers ORDER BY sales DESC LIMIT 10"}],
+#             ddl_list=["CREATE TABLE customers (id INT, name TEXT, sales DECIMAL)"],
+#             doc_list=["The customers table contains information about customers and their sales."],
+#         )
+#         ```
+#         This method is used to generate a prompt for the LLM to generate SQL.
+#         Args:
+#             question (str): The question to generate SQL for.
+#             question_sql_list (list): A list of questions and their corresponding SQL statements.
+#             ddl_list (list): A list of DDL statements.
+#             doc_list (list): A list of documentation.
+#         Returns:
+#             any: The prompt for the LLM to generate SQL.
+#         """
+#         if initial_prompt is None:
+#             initial_prompt = f"You are a {self.dialect} expert. " + \
+#             "Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. "
+#         initial_prompt = self.add_ddl_to_prompt(
+#             initial_prompt, ddl_list, max_tokens=self.max_tokens
+#         )
+#         if self.static_documentation != "":
+#             doc_list.append(self.static_documentation)
+#         initial_prompt = self.add_documentation_to_prompt(
+#             initial_prompt, doc_list, max_tokens=self.max_tokens
+#         )
+#         initial_prompt += (
+#             "===Response Guidelines \n"
+#             "1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
+#             "2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
+#             "3. If the provided context is insufficient, please explain why it can't be generated. \n"
+#             "4. Please use the most relevant table(s). \n"
+#             "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
+#             f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
+#         )
+#         message_log = [self.system_message(initial_prompt)]
+#         for example in question_sql_list:
+#             if example is None:
+#                 print("example is None")
+#             else:
+#                 if example is not None and "question" in example and "sql" in example:
+#                     message_log.append(self.user_message(example["question"]))
+#                     message_log.append(self.assistant_message(example["sql"]))
+#         message_log.append(self.user_message(question))
+#         return message_log