Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

armanddemasson committed on Apr 24

Commit

5b14820

1 Parent(s): 764c3bd

refactor: modularized talk to data

Browse files

Files changed (12) hide show

climateqa/engine/talk_to_data/{config.py → drias/config.py} +3 -2
climateqa/engine/talk_to_data/{plot.py → drias/plots.py} +12 -36
climateqa/engine/talk_to_data/{sql_query.py → drias/queries.py} +0 -34
climateqa/engine/talk_to_data/{utils.py → input_processing.py} +88 -151
climateqa/engine/talk_to_data/main.py +3 -64
climateqa/engine/talk_to_data/objects/llm_outputs.py +13 -0
climateqa/engine/talk_to_data/objects/location.py +7 -0
climateqa/engine/talk_to_data/objects/plot.py +21 -0
climateqa/engine/talk_to_data/objects/states.py +46 -0
climateqa/engine/talk_to_data/query.py +52 -0
climateqa/engine/talk_to_data/{talk_to_drias.py → workflow/drias.py} +12 -193
front/tabs/tab_drias.py +1 -1

climateqa/engine/talk_to_data/{config.py → drias/config.py} RENAMED Viewed

@@ -1,3 +1,4 @@
 DRIAS_TABLES = [
     "total_winter_precipitation",
     "total_summer_precipiation",
@@ -15,7 +16,7 @@ DRIAS_TABLES = [
     "number_of_days_with_a_dry_ground",
 ]
-INDICATOR_COLUMNS_PER_TABLE = {
     "total_winter_precipitation": "total_winter_precipitation",
     "total_summer_precipiation": "total_summer_precipitation",
     "total_annual_precipitation": "total_annual_precipitation",
@@ -52,7 +53,7 @@ DRIAS_MODELS = [
     'CCLM4-8-17_HadGEM2-ES'
 ]
 # Mapping between indicator columns and their units
-INDICATOR_TO_UNIT = {
     "total_winter_precipitation": "mm",
     "total_summer_precipitation": "mm",
     "total_annual_precipitation": "mm",

 DRIAS_TABLES = [
     "total_winter_precipitation",
     "total_summer_precipiation",
     "number_of_days_with_a_dry_ground",
 ]
+DRIAS_INDICATOR_COLUMNS_PER_TABLE = {
     "total_winter_precipitation": "total_winter_precipitation",
     "total_summer_precipiation": "total_summer_precipitation",
     "total_annual_precipitation": "total_annual_precipitation",
     'CCLM4-8-17_HadGEM2-ES'
 ]
 # Mapping between indicator columns and their units
+DRIAS_INDICATOR_TO_UNIT = {
     "total_winter_precipitation": "mm",
     "total_summer_precipitation": "mm",
     "total_annual_precipitation": "mm",

climateqa/engine/talk_to_data/{plot.py → drias/plots.py} RENAMED Viewed

@@ -1,38 +1,15 @@
-from typing import Callable, TypedDict
-from matplotlib.figure import figaspect
 import pandas as pd
 from plotly.graph_objects import Figure
 import plotly.graph_objects as go
-import plotly.express as px
-from climateqa.engine.talk_to_data.sql_query import (
     indicator_for_given_year_query,
     indicator_per_year_at_location_query,
 )
-from climateqa.engine.talk_to_data.config import INDICATOR_TO_UNIT
-class Plot(TypedDict):
-    """Represents a plot configuration in the DRIAS system.
-    This class defines the structure for configuring different types of plots
-    that can be generated from climate data.
-    Attributes:
-        name (str): The name of the plot type
-        description (str): A description of what the plot shows
-        params (list[str]): List of required parameters for the plot
-        plot_function (Callable[..., Callable[..., Figure]]): Function to generate the plot
-        sql_query (Callable[..., str]): Function to generate the SQL query for the plot
-    """
-    name: str
-    description: str
-    params: list[str]
-    plot_function: Callable[..., Callable[..., Figure]]
-    sql_query: Callable[..., str]
 def plot_indicator_evolution_at_location(params: dict) -> Callable[..., Figure]:
     """Generates a function to plot indicator evolution over time at a location.
@@ -61,7 +38,7 @@ def plot_indicator_evolution_at_location(params: dict) -> Callable[..., Figure]:
     indicator = params["indicator_column"]
     location = params["location"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
-    unit = INDICATOR_TO_UNIT.get(indicator, "")
     def plot_data(df: pd.DataFrame) -> Figure:
         """Generates the actual plot from the data.
@@ -184,7 +161,7 @@ def plot_indicator_number_of_days_per_year_at_location(
     indicator = params["indicator_column"]
     location = params["location"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
-    unit = INDICATOR_TO_UNIT.get(indicator, "")
     def plot_data(df: pd.DataFrame) -> Figure:
         """Generate the figure thanks to the dataframe
@@ -266,7 +243,7 @@ def plot_distribution_of_indicator_for_given_year(
     indicator = params["indicator_column"]
     year = params["year"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
-    unit = INDICATOR_TO_UNIT.get(indicator, "")
     def plot_data(df: pd.DataFrame) -> Figure:
         """Generate the figure thanks to the dataframe
@@ -347,7 +324,7 @@ def plot_map_of_france_of_indicator_for_given_year(
     indicator = params["indicator_column"]
     year = params["year"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
-    unit = INDICATOR_TO_UNIT.get(indicator, "")
     def plot_data(df: pd.DataFrame) -> Figure:
         fig = go.Figure()
@@ -409,10 +386,9 @@ map_of_france_of_indicator_for_given_year: Plot = {
     "sql_query": indicator_for_given_year_query,
 }
-PLOTS = [
     indicator_evolution_at_location,
     indicator_number_of_days_per_year_at_location,
     distribution_of_indicator_for_given_year,
     map_of_france_of_indicator_for_given_year,
-]

+import os
+from typing import Callable
 import pandas as pd
 from plotly.graph_objects import Figure
 import plotly.graph_objects as go
+from climateqa.engine.talk_to_data.objects.plot import Plot
+from climateqa.engine.talk_to_data.drias.queries import (
     indicator_for_given_year_query,
     indicator_per_year_at_location_query,
 )
+from climateqa.engine.talk_to_data.drias.config import DRIAS_INDICATOR_TO_UNIT
 def plot_indicator_evolution_at_location(params: dict) -> Callable[..., Figure]:
     """Generates a function to plot indicator evolution over time at a location.
     indicator = params["indicator_column"]
     location = params["location"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
+    unit = DRIAS_INDICATOR_TO_UNIT.get(indicator, "")
     def plot_data(df: pd.DataFrame) -> Figure:
         """Generates the actual plot from the data.
     indicator = params["indicator_column"]
     location = params["location"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
+    unit = DRIAS_INDICATOR_TO_UNIT.get(indicator, "")
     def plot_data(df: pd.DataFrame) -> Figure:
         """Generate the figure thanks to the dataframe
     indicator = params["indicator_column"]
     year = params["year"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
+    unit = DRIAS_INDICATOR_TO_UNIT.get(indicator, "")
     def plot_data(df: pd.DataFrame) -> Figure:
         """Generate the figure thanks to the dataframe
     indicator = params["indicator_column"]
     year = params["year"]
     indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
+    unit = DRIAS_INDICATOR_TO_UNIT.get(indicator, "")
     def plot_data(df: pd.DataFrame) -> Figure:
         fig = go.Figure()
     "sql_query": indicator_for_given_year_query,
 }
+DRIAS_PLOTS = [
     indicator_evolution_at_location,
     indicator_number_of_days_per_year_at_location,
     distribution_of_indicator_for_given_year,
     map_of_france_of_indicator_for_given_year,
+]

climateqa/engine/talk_to_data/{sql_query.py → drias/queries.py} RENAMED Viewed

@@ -1,37 +1,4 @@
-import asyncio
-from concurrent.futures import ThreadPoolExecutor
 from typing import TypedDict
-import duckdb
-import pandas as pd
-async def execute_sql_query(sql_query: str) -> pd.DataFrame:
-    """Executes a SQL query on the DRIAS database and returns the results.
-    This function connects to the DuckDB database containing DRIAS climate data
-    and executes the provided SQL query. It handles the database connection and
-    returns the results as a pandas DataFrame.
-    Args:
-        sql_query (str): The SQL query to execute
-    Returns:
-        pd.DataFrame: A DataFrame containing the query results
-    Raises:
-        duckdb.Error: If there is an error executing the SQL query
-    """
-    def _execute_query():
-        # Execute the query
-        con = duckdb.connect()
-        results = con.sql(sql_query).fetchdf()
-        # return fetched data
-        return results
-    # Run the query in a thread pool to avoid blocking
-    loop = asyncio.get_event_loop()
-    with ThreadPoolExecutor() as executor:
-        return await loop.run_in_executor(executor, _execute_query)
 class IndicatorPerYearAtLocationQueryParams(TypedDict, total=False):
     """Parameters for querying an indicator's values over time at a location.
@@ -50,7 +17,6 @@ class IndicatorPerYearAtLocationQueryParams(TypedDict, total=False):
     longitude: str
     model: str
 def indicator_per_year_at_location_query(
     table: str, params: IndicatorPerYearAtLocationQueryParams
 ) -> str:

 from typing import TypedDict
 class IndicatorPerYearAtLocationQueryParams(TypedDict, total=False):
     """Parameters for querying an indicator's values over time at a location.
     longitude: str
     model: str
 def indicator_per_year_at_location_query(
     table: str, params: IndicatorPerYearAtLocationQueryParams
 ) -> str:

climateqa/engine/talk_to_data/{utils.py → input_processing.py} RENAMED Viewed

@@ -1,13 +1,14 @@
-import re
-from typing import Annotated, TypedDict
-import duckdb
-from geopy.geocoders import Nominatim
 import ast
-from climateqa.engine.llm import get_llm
-from climateqa.engine.talk_to_data.config import DRIAS_TABLES
-from climateqa.engine.talk_to_data.plot import PLOTS, Plot
 from langchain_core.prompts import ChatPromptTemplate
 async def detect_location_with_openai(sentence):
     """
@@ -29,63 +30,7 @@ async def detect_location_with_openai(sentence):
     else:
         return ""
-class ArrayOutput(TypedDict):
-    """Represents the output of a function that returns an array.
-    This class is used to type-hint functions that return arrays,
-    ensuring consistent return types across the codebase.
-    Attributes:
-        array (str): A syntactically valid Python array string
-    """
-    array: Annotated[str, "Syntactically valid python array."]
-async def detect_year_with_openai(sentence: str) -> str:
-    """
-    Detects years in a sentence using OpenAI's API via LangChain.
-    """
-    llm = get_llm()
-    prompt = """
-    Extract all years mentioned in the following sentence.
-    Return the result as a Python list. If no year are mentioned, return an empty list.
-    Sentence: "{sentence}"
-    """
-    prompt = ChatPromptTemplate.from_template(prompt)
-    structured_llm = llm.with_structured_output(ArrayOutput)
-    chain = prompt | structured_llm
-    response: ArrayOutput = await chain.ainvoke({"sentence": sentence})
-    years_list = eval(response['array'])
-    if len(years_list) > 0:
-        return years_list[0]
-    else:
-        return ""
-def detectTable(sql_query: str) -> list[str]:
-    """Extracts table names from a SQL query.
-    This function uses regular expressions to find all table names
-    referenced in a SQL query's FROM clause.
-    Args:
-        sql_query (str): The SQL query to analyze
-    Returns:
-        list[str]: A list of table names found in the query
-    Example:
-        >>> detectTable("SELECT * FROM temperature_data WHERE year > 2000")
-        ['temperature_data']
-    """
-    pattern = r'(?i)\bFROM\s+((?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+)(?:\.(?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+))*)'
-    matches = re.findall(pattern, sql_query)
-    return matches
-def loc2coords(location: str) -> tuple[float, float]:
     """Converts a location name to geographic coordinates.
     This function uses the Nominatim geocoding service to convert
@@ -105,32 +50,7 @@ def loc2coords(location: str) -> tuple[float, float]:
     return (coords.latitude, coords.longitude)
-def coords2loc(coords: tuple[float, float]) -> str:
-    """Converts geographic coordinates to a location name.
-    This function uses the Nominatim reverse geocoding service to convert
-    latitude and longitude coordinates to a human-readable location name.
-    Args:
-        coords (tuple[float, float]): A tuple containing (latitude, longitude)
-    Returns:
-        str: The address of the location, or "Unknown Location" if not found
-    Example:
-        >>> coords2loc((48.8566, 2.3522))
-        'Paris, France'
-    """
-    geolocator = Nominatim(user_agent="coords_to_city")
-    try:
-        location = geolocator.reverse(coords)
-        return location.address
-    except Exception as e:
-        print(f"Error: {e}")
-        return "Unknown Location"
-def nearestNeighbourSQL(location: tuple, table: str) -> tuple[str, str]:
     long = round(location[1], 3)
     lat = round(location[0], 3)
@@ -145,8 +65,31 @@ def nearestNeighbourSQL(location: tuple, table: str) -> tuple[str, str]:
     # cursor.execute(f"SELECT latitude, longitude FROM {table} WHERE latitude BETWEEN {lat - 0.3} AND {lat + 0.3} AND longitude BETWEEN {long - 0.3} AND {long + 0.3}")
     return results['latitude'].iloc[0], results['longitude'].iloc[0]
-async def detect_relevant_tables(user_question: str, plot: Plot, llm) -> list[str]:
     """Identifies relevant tables for a plot based on user input.
     This function uses an LLM to analyze the user's question and the plot
@@ -170,7 +113,6 @@ async def detect_relevant_tables(user_question: str, plot: Plot, llm) -> list[st
         ['mean_annual_temperature', 'mean_summer_temperature']
     """
     # Get all table names
-    table_names_list = DRIAS_TABLES
     prompt = (
         f"You are helping to build a plot following this description : {plot['description']}."
@@ -187,19 +129,9 @@ async def detect_relevant_tables(user_question: str, plot: Plot, llm) -> list[st
     )
     return table_names
-def replace_coordonates(coords, query, coords_tables):
-    n = query.count(str(coords[0]))
-    for i in range(n):
-        query = query.replace(str(coords[0]), str(coords_tables[i][0]), 1)
-        query = query.replace(str(coords[1]), str(coords_tables[i][1]), 1)
-    return query
-async def detect_relevant_plots(user_question: str, llm):
     plots_description = ""
-    for plot in PLOTS:
         plots_description += "Name: " + plot["name"]
         plots_description += " - Description: " + plot["description"] + "\n"
@@ -227,55 +159,60 @@ async def detect_relevant_plots(user_question: str, llm):
     )
     return plot_names
-# Next Version
-# class QueryOutput(TypedDict):
-#     """Generated SQL query."""
-#     query: Annotated[str, ..., "Syntactically valid SQL query."]
-# class PlotlyCodeOutput(TypedDict):
-#     """Generated Plotly code"""
-#     code: Annotated[str, ..., "Synatically valid Plotly python code."]
-# def write_sql_query(user_input: str, db: SQLDatabase, relevant_tables: list[str], llm):
-#     """Generate SQL query to fetch information."""
-#     prompt_params = {
-#         "dialect": db.dialect,
-#         "table_info": db.get_table_info(),
-#         "input": user_input,
-#         "relevant_tables": relevant_tables,
-#         "model": "ALADIN63_CNRM-CM5",
-#     }
-#     prompt = ChatPromptTemplate.from_template(query_prompt_template)
-#     structured_llm = llm.with_structured_output(QueryOutput)
-#     chain = prompt | structured_llm
-#     result = chain.invoke(prompt_params)
-#     return result["query"]
-# def fetch_data_from_sql_query(db: str, sql_query: str):
-#     conn = sqlite3.connect(db)
-#     cursor = conn.cursor()
-#     cursor.execute(sql_query)
-#     column_names = [desc[0] for desc in cursor.description]
-#     values = cursor.fetchall()
-#     return {"column_names": column_names, "data": values}
-# def generate_chart_code(user_input: str, sql_query: list[str], llm):
-#     """ "Generate plotly python code for the chart based on the sql query and the user question"""
-#     class PlotlyCodeOutput(TypedDict):
-#         """Generated Plotly code"""
-#         code: Annotated[str, ..., "Synatically valid Plotly python code."]
-#     prompt = ChatPromptTemplate.from_template(plot_prompt_template)
-#     structured_llm = llm.with_structured_output(PlotlyCodeOutput)
-#     chain = prompt | structured_llm
-#     result = chain.invoke({"input": user_input, "sql_query": sql_query})
-#     return result["code"]

+from typing import Any
 import ast
 from langchain_core.prompts import ChatPromptTemplate
+from geopy.geocoders import Nominatim
+from climateqa.engine.llm import get_llm
+import duckdb
+from climateqa.engine.talk_to_data.objects.llm_outputs import ArrayOutput
+from climateqa.engine.talk_to_data.objects.location import Location
+from climateqa.engine.talk_to_data.objects.plot import Plot
+from climateqa.engine.talk_to_data.objects.states import State
 async def detect_location_with_openai(sentence):
     """
     else:
         return ""
+def loc_to_coords(location: str) -> tuple[float, float]:
     """Converts a location name to geographic coordinates.
     This function uses the Nominatim geocoding service to convert
     return (coords.latitude, coords.longitude)
+def nearest_neighbour_sql(location: tuple, table: str) -> tuple[str, str]:
     long = round(location[1], 3)
     lat = round(location[0], 3)
     # cursor.execute(f"SELECT latitude, longitude FROM {table} WHERE latitude BETWEEN {lat - 0.3} AND {lat + 0.3} AND longitude BETWEEN {long - 0.3} AND {long + 0.3}")
     return results['latitude'].iloc[0], results['longitude'].iloc[0]
+async def detect_year_with_openai(sentence: str) -> str:
+    """
+    Detects years in a sentence using OpenAI's API via LangChain.
+    """
+    llm = get_llm()
+    prompt = """
+    Extract all years mentioned in the following sentence.
+    Return the result as a Python list. If no year are mentioned, return an empty list.
+    Sentence: "{sentence}"
+    """
+    prompt = ChatPromptTemplate.from_template(prompt)
+    structured_llm = llm.with_structured_output(ArrayOutput)
+    chain = prompt | structured_llm
+    response: ArrayOutput = await chain.ainvoke({"sentence": sentence})
+    years_list = eval(response['array'])
+    if len(years_list) > 0:
+        return years_list[0]
+    else:
+        return ""
+async def detect_relevant_tables(user_question: str, plot: Plot, llm, table_names_list: list[str]) -> list[str]:
     """Identifies relevant tables for a plot based on user input.
     This function uses an LLM to analyze the user's question and the plot
         ['mean_annual_temperature', 'mean_summer_temperature']
     """
     # Get all table names
     prompt = (
         f"You are helping to build a plot following this description : {plot['description']}."
     )
     return table_names
+async def detect_relevant_plots(user_question: str, llm, plot_list: list[Plot]) -> list[str]:
     plots_description = ""
+    for plot in plot_list:
         plots_description += "Name: " + plot["name"]
         plots_description += " - Description: " + plot["description"] + "\n"
     )
     return plot_names
+async def find_location(user_input: str, table: str) -> Location:
+    print(f"---- Find location in table {table} ----")
+    location = await detect_location_with_openai(user_input)
+    output: Location = {'location' : location}
+    if location:
+        coords = loc_to_coords(location)
+        neighbour = nearest_neighbour_sql(coords, table)
+        output.update({
+            "latitude": neighbour[0],
+            "longitude": neighbour[1],
+        })
+    return output
+async def find_year(user_input: str) -> str:
+    """Extracts year information from user input using LLM.
+    This function uses an LLM to identify and extract year information from the
+    user's query, which is used to filter data in subsequent queries.
+    Args:
+        user_input (str): The user's query text
+    Returns:
+        str: The extracted year, or empty string if no year found
+    """
+    print(f"---- Find year ---")
+    year = await detect_year_with_openai(user_input)
+    return year
+async def find_relevant_plots(state: State, llm, plots: list[Plot]) -> list[str]:
+    print("---- Find relevant plots ----")
+    relevant_plots = await detect_relevant_plots(state['user_input'], llm, plots)
+    return relevant_plots
+async def find_relevant_tables_per_plot(state: State, plot: Plot, llm, tables: list[str]) -> list[str]:
+    print(f"---- Find relevant tables for {plot['name']} ----")
+    relevant_tables = await detect_relevant_tables(state['user_input'], plot, llm, tables)
+    return relevant_tables
+async def find_param(state: State, param_name:str, table: str) -> dict[str, Any] | None:
+    """Perform the good method to retrieve the desired parameter
+    Args:
+        state (State): state of the workflow
+        param_name (str): name of the desired parameter
+        table (str): name of the table
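+    Returns:
+        dict[str, Any] | None: the location dict when param_name is 'location', {'year': <year>} when it is 'year', or None for any other parameter name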
+    """
+    if param_name == 'location':
+        location = await find_location(state['user_input'], table)
+        return location
+    if param_name == 'year':
+        year = await find_year(state['user_input'])
+        return {'year': year}
+    return None

climateqa/engine/talk_to_data/main.py CHANGED Viewed

@@ -1,43 +1,8 @@
-from climateqa.engine.talk_to_data.talk_to_drias import drias_workflow
 from climateqa.engine.llm import get_llm
 from climateqa.logging import log_drias_interaction_to_huggingface
 import ast
-llm = get_llm(provider="openai")
-def ask_llm_to_add_table_names(sql_query: str, llm) -> str:
-    """Adds table names to the SQL query result rows using LLM.
-    This function modifies the SQL query to include the source table name in each row
-    of the result set, making it easier to track which data comes from which table.
-    Args:
-        sql_query (str): The original SQL query to modify
-        llm: The language model instance to use for generating the modified query
-    Returns:
-        str: The modified SQL query with table names included in the result rows
-    """
-    sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query}. Just answer the query. The answer should not include ```sql\n").content
-    return sql_with_table_names
-def ask_llm_column_names(sql_query: str, llm) -> list[str]:
-    """Extracts column names from a SQL query using LLM.
-    This function analyzes a SQL query to identify which columns are being selected
-    in the result set.
-    Args:
-        sql_query (str): The SQL query to analyze
-        llm: The language model instance to use for column extraction
-    Returns:
-        list[str]: A list of column names being selected in the query
-    """
-    columns = llm.invoke(f"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query : {sql_query}").content
-    columns_list = ast.literal_eval(columns.strip("```python\n").strip())
-    return columns_list
 async def ask_drias(query: str, index_state: int = 0, user_id: str = None) -> tuple:
     """Main function to process a DRIAS query and return results.
@@ -85,34 +50,8 @@ async def ask_drias(query: str, index_state: int = 0, user_id: str = None) -> tu
     sql_query = sql_queries[index_state]
     dataframe = result_dataframes[index_state]
-    figure = figures[index_state](dataframe)
     log_drias_interaction_to_huggingface(query, sql_query, user_id)
-    return sql_query, dataframe, figure, sql_queries, result_dataframes, figures, index_state, table_list, ""
-# def ask_vanna(vn,db_vanna_path, query):
-#     try :
-#         location = detect_location_with_openai(query)
-#         if location:
-#             coords = loc2coords(location)
-#             user_input = query.lower().replace(location.lower(), f"lat, long : {coords}")
-#             relevant_tables = detect_relevant_tables(db_vanna_path, user_input, llm)
-#             coords_tables = [nearestNeighbourSQL(db_vanna_path, coords, relevant_tables[i]) for i in range(len(relevant_tables))]
-#             user_input_with_coords = replace_coordonates(coords, user_input, coords_tables)
-            # sql_query, result_dataframe, figure = vn.ask(user_input_with_coords, print_results=False, allow_llm_to_see_data=True, auto_train=False)
-#             return sql_query, result_dataframe, figure
-#         else :
-#             empty_df = pd.DataFrame()
-#             empty_fig = None
-#             return "", empty_df, empty_fig
-#     except Exception as e:
-#         print(f"Error: {e}")
-#         empty_df = pd.DataFrame()
-#         empty_fig = None
-#         return "", empty_df, empty_fig

+from climateqa.engine.talk_to_data.workflow.drias import drias_workflow
 from climateqa.engine.llm import get_llm
 from climateqa.logging import log_drias_interaction_to_huggingface
 import ast
 async def ask_drias(query: str, index_state: int = 0, user_id: str = None) -> tuple:
     """Main function to process a DRIAS query and return results.
     sql_query = sql_queries[index_state]
     dataframe = result_dataframes[index_state]
+    figure = figures[index_state](dataframe)
     log_drias_interaction_to_huggingface(query, sql_query, user_id)
+    return sql_query, dataframe, figure, sql_queries, result_dataframes, figures, index_state, table_list, ""

climateqa/engine/talk_to_data/objects/llm_outputs.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from typing import Annotated, TypedDict
+class ArrayOutput(TypedDict):
+    """Represents the output of a function that returns an array.
+    This class is used to type-hint functions that return arrays,
+    ensuring consistent return types across the codebase.
+    Attributes:
+        array (str): A syntactically valid Python array string
+    """
+    array: Annotated[str, "Syntactically valid python array."]

climateqa/engine/talk_to_data/objects/location.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from typing import Optional, TypedDict
+class Location(TypedDict):
+    location: str
+    latitude: Optional[str]
+    longitude: Optional[str]

climateqa/engine/talk_to_data/objects/plot.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from typing import Callable, TypedDict
+from plotly.graph_objects import Figure
+class Plot(TypedDict):
+    """Represents a plot configuration in the DRIAS system.
+    This class defines the structure for configuring different types of plots
+    that can be generated from climate data.
+    Attributes:
+        name (str): The name of the plot type
+        description (str): A description of what the plot shows
+        params (list[str]): List of required parameters for the plot
+        plot_function (Callable[..., Callable[..., Figure]]): Function to generate the plot
+        sql_query (Callable[..., str]): Function to generate the SQL query for the plot
+    """
+    name: str
+    description: str
+    params: list[str]
+    plot_function: Callable[..., Callable[..., Figure]]
+    sql_query: Callable[..., str]

climateqa/engine/talk_to_data/objects/states.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from typing import Any, Callable, Optional, TypedDict
+from plotly.graph_objects import Figure
+import pandas as pd
+class TableState(TypedDict):
+    """Represents the state of a table in the DRIAS workflow.
+    This class defines the structure for tracking the state of a table during the
+    data processing workflow, including its name, parameters, SQL query, and results.
+    Attributes:
+        table_name (str): The name of the table in the database
+        params (dict[str, Any]): Parameters used for querying the table
+        sql_query (str, optional): The SQL query used to fetch data
+        dataframe (pd.DataFrame | None, optional): The resulting data
+        figure (Callable[..., Figure], optional): Function to generate visualization
+        status (str): The current status of the table processing ('OK' or 'ERROR')
+    """
+    table_name: str
+    params: dict[str, Any]
+    sql_query: Optional[str]
+    dataframe: Optional[pd.DataFrame | None]
+    figure: Optional[Callable[..., Figure]]
+    status: str
+class PlotState(TypedDict):
+    """Represents the state of a plot in the DRIAS workflow.
+    This class defines the structure for tracking the state of a plot during the
+    data processing workflow, including its name and associated tables.
+    Attributes:
+        plot_name (str): The name of the plot
+        tables (list[str]): List of tables used in the plot
+        table_states (dict[str, TableState]): States of the tables used in the plot
+    """
+    plot_name: str
+    tables: list[str]
+    table_states: dict[str, TableState]
+class State(TypedDict):
+    user_input: str
+    plots: list[str]
+    plot_states: dict[str, PlotState]
+    error: Optional[str]

climateqa/engine/talk_to_data/query.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import duckdb
+import pandas as pd
+def find_indicator_column(table: str, indicator_columns_per_table: dict[str,str]) -> str:
+    """Retrieves the name of the indicator column within a table.
+    This function maps table names to their corresponding indicator columns
+    using the mapping supplied by the caller.
+    Args:
+        table (str): Name of the table in the database
+        indicator_columns_per_table (dict[str, str]): Mapping from table name to indicator column name
+    Returns:
+        str: Name of the indicator column for the specified table
+    Raises:
+        KeyError: If the table name is not found in the mapping
+    """
+    print(f"---- Find indicator column in table {table} ----")
+    return indicator_columns_per_table[table]
+async def execute_sql_query(sql_query: str) -> pd.DataFrame:
+    """Executes a SQL query on the DRIAS database and returns the results.
+    This function connects to the DuckDB database containing DRIAS climate data
+    and executes the provided SQL query. It handles the database connection and
+    returns the results as a pandas DataFrame.
+    Args:
+        sql_query (str): The SQL query to execute
+    Returns:
+        pd.DataFrame: A DataFrame containing the query results
+    Raises:
+        duckdb.Error: If there is an error executing the SQL query
+    """
+    def _execute_query():
+        # Execute the query
+        con = duckdb.connect()
+        results = con.sql(sql_query).fetchdf()
+        # return fetched data
+        return results
+    # Run the query in a thread pool to avoid blocking
+    loop = asyncio.get_event_loop()
+    with ThreadPoolExecutor() as executor:
+        return await loop.run_in_executor(executor, _execute_query)

climateqa/engine/talk_to_data/{talk_to_drias.py → workflow/drias.py} RENAMED Viewed

@@ -1,151 +1,17 @@
 import os
-from typing import Any, Callable, TypedDict, Optional
-from numpy import sort
-import pandas as pd
 import asyncio
-from plotly.graph_objects import Figure
 from climateqa.engine.llm import get_llm
-from climateqa.engine.talk_to_data import sql_query
-from climateqa.engine.talk_to_data.config import INDICATOR_COLUMNS_PER_TABLE
-from climateqa.engine.talk_to_data.plot import PLOTS, Plot
-from climateqa.engine.talk_to_data.sql_query import execute_sql_query
-from climateqa.engine.talk_to_data.utils import (
-    detect_relevant_plots,
-    detect_year_with_openai,
-    loc2coords,
-    detect_location_with_openai,
-    nearestNeighbourSQL,
-    detect_relevant_tables,
-)
 ROOT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
-class TableState(TypedDict):
-    """Represents the state of a table in the DRIAS workflow.
-    This class defines the structure for tracking the state of a table during the
-    data processing workflow, including its name, parameters, SQL query, and results.
-    Attributes:
-        table_name (str): The name of the table in the database
-        params (dict[str, Any]): Parameters used for querying the table
-        sql_query (str, optional): The SQL query used to fetch data
-        dataframe (pd.DataFrame | None, optional): The resulting data
-        figure (Callable[..., Figure], optional): Function to generate visualization
-        status (str): The current status of the table processing ('OK' or 'ERROR')
-    """
-    table_name: str
-    params: dict[str, Any]
-    sql_query: Optional[str]
-    dataframe: Optional[pd.DataFrame | None]
-    figure: Optional[Callable[..., Figure]]
-    status: str
-class PlotState(TypedDict):
-    """Represents the state of a plot in the DRIAS workflow.
-    This class defines the structure for tracking the state of a plot during the
-    data processing workflow, including its name and associated tables.
-    Attributes:
-        plot_name (str): The name of the plot
-        tables (list[str]): List of tables used in the plot
-        table_states (dict[str, TableState]): States of the tables used in the plot
-    """
-    plot_name: str
-    tables: list[str]
-    table_states: dict[str, TableState]
-class State(TypedDict):
-    user_input: str
-    plots: list[str]
-    plot_states: dict[str, PlotState]
-    error: Optional[str]
-async def find_relevant_plots(state: State, llm) -> list[str]:
-    print("---- Find relevant plots ----")
-    relevant_plots = await detect_relevant_plots(state['user_input'], llm)
-    return relevant_plots
-async def find_relevant_tables_per_plot(state: State, plot: Plot, llm) -> list[str]:
-    print(f"---- Find relevant tables for {plot['name']} ----")
-    relevant_tables = await detect_relevant_tables(state['user_input'], plot, llm)
-    return relevant_tables
-async def find_param(state: State, param_name:str, table: str) -> dict[str, Any] | None:
-    """Perform the good method to retrieve the desired parameter
-    Args:
-        state (State): state of the workflow
-        param_name (str): name of the desired parameter
-        table (str): name of the table
-    Returns:
-        dict[str, Any] | None:
-    """
-    if param_name == 'location':
-        location = await find_location(state['user_input'], table)
-        return location
-    if param_name == 'year':
-        year = await find_year(state['user_input'])
-        return {'year': year}
-    return None
-class Location(TypedDict):
-    location: str
-    latitude: Optional[str]
-    longitude: Optional[str]
-async def find_location(user_input: str, table: str) -> Location:
-    print(f"---- Find location in table {table} ----")
-    location = await detect_location_with_openai(user_input)
-    output: Location = {'location' : location}
-    if location:
-        coords = loc2coords(location)
-        neighbour = nearestNeighbourSQL(coords, table)
-        output.update({
-            "latitude": neighbour[0],
-            "longitude": neighbour[1],
-        })
-    return output
-async def find_year(user_input: str) -> str:
-    """Extracts year information from user input using LLM.
-    This function uses an LLM to identify and extract year information from the
-    user's query, which is used to filter data in subsequent queries.
-    Args:
-        user_input (str): The user's query text
-    Returns:
-        str: The extracted year, or empty string if no year found
-    """
-    print(f"---- Find year ---")
-    year = await detect_year_with_openai(user_input)
-    return year
-def find_indicator_column(table: str) -> str:
-    """Retrieves the name of the indicator column within a table.
-    This function maps table names to their corresponding indicator columns
-    using the predefined mapping in INDICATOR_COLUMNS_PER_TABLE.
-    Args:
-        table (str): Name of the table in the database
-    Returns:
-        str: Name of the indicator column for the specified table
-    Raises:
-        KeyError: If the table name is not found in the mapping
-    """
-    print(f"---- Find indicator column in table {table} ----")
-    return INDICATOR_COLUMNS_PER_TABLE[table]
 async def process_table(
     table: str,
     params: dict[str, Any],
@@ -173,7 +39,7 @@ async def process_table(
         'figure': None
     }
-    table_state['params']['indicator_column'] = find_indicator_column(table)
     sql_query = plot['sql_query'](table, table_state['params'])
     if sql_query == "":
@@ -187,6 +53,7 @@ async def process_table(
     return table_state
 async def drias_workflow(user_input: str) -> State:
     """Performs the complete workflow of Talk To Drias : from user input to sql queries, dataframes and figures generated
@@ -205,7 +72,7 @@ async def drias_workflow(user_input: str) -> State:
     llm = get_llm(provider="openai")
-    plots = await find_relevant_plots(state, llm)
     state['plots'] = plots
@@ -219,7 +86,7 @@ async def drias_workflow(user_input: str) -> State:
     for plot_name in state['plots']:
-        plot = next((p for p in PLOTS if p['name'] == plot_name), None) # Find the associated plot object
         if plot is None:
             continue
@@ -231,7 +98,7 @@ async def drias_workflow(user_input: str) -> State:
         plot_state['plot_name'] = plot_name
-        relevant_tables = await find_relevant_tables_per_plot(state, plot, llm)
         if len(relevant_tables) > 0 :
             have_relevant_table = True
@@ -267,51 +134,3 @@ async def drias_workflow(user_input: str) -> State:
         state['error'] = "There is no data in our table that can answer to your question"
     return state
-# def make_write_query_node():
-#     def write_query(state):
-#         print("---- Write query ----")
-#         for table in state["tables"]:
-#             sql_query = QUERIES[state[table]['query_type']](
-#                 table=table,
-#                 indicator_column=state[table]["columns"],
-#                 longitude=state[table]["longitude"],
-#                 latitude=state[table]["latitude"],
-#             )
-#             state[table].update({"sql_query": sql_query})
-#         return state
-#     return write_query
-# def make_fetch_data_node(db_path):
-#     def fetch_data(state):
-#         print("---- Fetch data ----")
-#         for table in state["tables"]:
-#             results = execute_sql_query(db_path, state[table]['sql_query'])
-#             state[table].update(results)
-#         return state
-#     return fetch_data
-## V2
-# def make_fetch_data_node(db_path: str, llm):
-#     def fetch_data(state):
-#         print("---- Fetch data ----")
-#         db = SQLDatabase.from_uri(f"sqlite:///{db_path}")
-#         output = {}
-#         sql_query = write_sql_query(state["query"], db, state["tables"], llm)
-#         # TO DO : Add query checker
-#         print(f"SQL query  : {sql_query}")
-#         output["sql_query"] = sql_query
-#         output.update(fetch_data_from_sql_query(db_path, sql_query))
-#         return output
-#     return fetch_data

 import os
+from typing import Any
 import asyncio
 from climateqa.engine.llm import get_llm
+from climateqa.engine.talk_to_data.input_processing import find_param, find_relevant_plots, find_relevant_tables_per_plot
+from climateqa.engine.talk_to_data.query import execute_sql_query, find_indicator_column
+from climateqa.engine.talk_to_data.objects.plot import Plot
+from climateqa.engine.talk_to_data.objects.states import PlotState, State, TableState
+from climateqa.engine.talk_to_data.drias.config import DRIAS_TABLES, DRIAS_INDICATOR_COLUMNS_PER_TABLE
+from climateqa.engine.talk_to_data.drias.plots import DRIAS_PLOTS
 ROOT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
 async def process_table(
     table: str,
     params: dict[str, Any],
         'figure': None
     }
+    table_state['params']['indicator_column'] = find_indicator_column(table, DRIAS_INDICATOR_COLUMNS_PER_TABLE)
     sql_query = plot['sql_query'](table, table_state['params'])
     if sql_query == "":
     return table_state
 async def drias_workflow(user_input: str) -> State:
     """Performs the complete workflow of Talk To Drias : from user input to sql queries, dataframes and figures generated
     llm = get_llm(provider="openai")
+    plots = await find_relevant_plots(state, llm, DRIAS_PLOTS)
     state['plots'] = plots
     for plot_name in state['plots']:
+        plot = next((p for p in DRIAS_PLOTS if p['name'] == plot_name), None) # Find the associated plot object
         if plot is None:
             continue
         plot_state['plot_name'] = plot_name
+        relevant_tables = await find_relevant_tables_per_plot(state, plot, llm, DRIAS_TABLES)
         if len(relevant_tables) > 0 :
             have_relevant_table = True
         state['error'] = "There is no data in our table that can answer to your question"
     return state

front/tabs/tab_drias.py CHANGED Viewed

@@ -4,7 +4,7 @@ import os
 import pandas as pd
 from climateqa.engine.talk_to_data.main import ask_drias
-from climateqa.engine.talk_to_data.config import DRIAS_MODELS, DRIAS_UI_TEXT
 class DriasUIElements(TypedDict):
     tab: gr.Tab

 import pandas as pd
 from climateqa.engine.talk_to_data.main import ask_drias
+from climateqa.engine.talk_to_data.drias.config import DRIAS_MODELS, DRIAS_UI_TEXT
 class DriasUIElements(TypedDict):
     tab: gr.Tab