Spaces:

Nymbo
/

Tools

Running

App Files Community

Nymbo committed on Sep 5

Commit

1b92bc9

verified ·

1 Parent(s): fdd5b1f

shortening MCP tool descriptions to make context more concise

Browse files

Files changed (1) hide show

app.py +15 -59

app.py CHANGED Viewed

@@ -206,17 +206,7 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
 def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str:
-    """
-    Convert the page's main content (or body fallback) to Markdown, similar to
-    web-scraper's Content Scraper tool, but without any file download side-effects.
-    Steps:
-      - Remove noisy elements (script/style/nav/footer/header/aside)
-      - Prefer <main>, <article>, or common content containers; fallback to <body>
-      - Convert to Markdown with ATX headings
-      - Clean up excessive newlines, empty links, and whitespace
-      - Prepend a title header when available
-    """
     # Remove unwanted elements globally first
     for element in full_soup.select("script, style, nav, footer, header, aside"):
         element.decompose()
@@ -423,8 +413,7 @@ def Search_DuckDuckGo(  # <-- MCP tool #2 (DDG Search)
     max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
 ) -> str:
     """
-    Run a DuckDuckGo search with enhanced error handling and readable text output.
-    Always returns results in human-friendly format with snippets included.
     Args:
         query (str): The search query string. Supports operators like site:, quotes for exact matching,
@@ -670,41 +659,25 @@ def List_Kokoro_Voices() -> List[str]:
 def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.25,
-    voice: Annotated[str, "Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices. Examples: 'af_heart' (US female), 'am_adam' (US male), 'bf_alice' (British female), 'jf_alpha' (Japanese female)."] = "af_heart",
 ) -> Tuple[int, np.ndarray]:
     """
-    Synthesize speech from text using the Kokoro-82M model with 54 voice options.
     This function returns raw audio suitable for a Gradio Audio component and is
     also exposed as an MCP tool. It supports 54 different voices across multiple
     languages and accents including American, British, European, Hindi, Italian,
     Japanese, Portuguese, and Chinese speakers.
-    Enhanced for longer audio generation:
-        - Processes ALL text segments (not just the first one)
-        - Can generate audio of any length based on input text
-        - Concatenates multiple segments for seamless longer audio
-    Default behavior:
-        - Speed defaults to 1.25 (slightly brisk cadence) for clearer, snappier delivery.
-        - Voice defaults to "af_heart" (American Female, Heart voice)
     Args:
         text (str): The text to synthesize. Works best with English but supports multiple languages.
         speed (float): Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
-        voice (str): Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices. Default: 'af_heart'.
     Returns:
         A tuple of (sample_rate_hz, audio_waveform) where:
         - sample_rate_hz: int sample rate in Hz (24_000)
         - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
-    Notes:
-        - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is raised.
-        - Runs on CUDA if available; otherwise CPU.
-        - Supports 54 voices across 9 language/accent categories.
-        - Can generate audio of any length - no 30 second limit!
-        - Use List_Kokoro_Voices() MCP tool to discover all available voice options.
     """
     _log_call_start("Generate_Speech", text=_truncate_for_log(text, 200), speed=speed, voice=voice)
     if not text or not text.strip():
@@ -793,11 +766,7 @@ fetch_interface = gr.Interface(
     ),
     api_description=(
         "Fetch a web page and return it converted to Markdown format with configurable length. "
-        "This function retrieves a webpage and converts its main content to clean Markdown, "
-        "preserving headings, formatting, and structure while removing navigation, footers, scripts, "
-        "and other non-content elements. Parameters: url (str - absolute URL), verbosity (str - "
-        "Brief/Standard/Full controlling output length: Brief=1000 chars, Standard=3000 chars, Full=complete page). "
-        "Returns clean Markdown with page title as H1 header and preserved content hierarchy."
     ),
     flagging_mode="never",
 )
@@ -815,8 +784,7 @@ concise_interface = gr.Interface(
         "<div style=\"text-align:center\">Enhanced web search with readable output format. Always includes snippets for better context and understanding.</div>"
     ),
     api_description=(
-        "Run a DuckDuckGo search with enhanced error handling and readable text output. "
-        "Always returns results in human-friendly format with snippets included for better context. "
         "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
         "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
         "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'."
@@ -902,11 +870,10 @@ kokoro_interface = gr.Interface(
         "<div style=\"text-align:center\">Generate speech with Kokoro-82M using 54 different voices. Supports multiple languages and accents. Can generate audio of any length! Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
-        "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
-        "Supports unlimited text length by processing all segments. Voice examples: 'af_heart' (US female), 'am_adam' (US male), "
-        "'bf_alice' (British female), 'bm_daniel' (British male), 'jf_alpha' (Japanese female), 'zf_xiaoni' (Chinese female). "
         "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options, default 'af_heart'). "
-        "Use List_Kokoro_Voices() to see all available voices. "
         "Return the generated media to the user in this format `![Alt text](URL)`"
     ),
     flagging_mode="never",
@@ -935,14 +902,7 @@ def Generate_Image(  # <-- MCP tool #5 (Generate Image)
     height: Annotated[int, "Output height in pixels (64–1216, multiple of 32 recommended)."] = 1024,
 ) -> Image.Image:
     """
-    Generate a single image from a text prompt using a Hugging Face model via
-    serverless Inference. Returns a PIL image. By default, the model is
-    black-forest-labs/FLUX.1-Krea-dev.
-    Notes (MCP):
-    - Per the latest Gradio MCP docs, images returned from tools are handled by the server and
-      converted to file URLs automatically for MCP clients. Ensure type hints and this docstring
-      "Args:" block are present so the tool schema is accurate.
     Args:
         prompt (str): Text description of the image to generate.
@@ -1034,9 +994,9 @@ image_generation_interface = gr.Interface(
         "Default model is FLUX.1-Krea</div>"
     ),
     api_description=(
-        "Generate a single image from a text prompt using a Hugging Face model (serverless Inference). "
         "Supports creative prompts like 'a serene mountain landscape at sunset', 'portrait of a wise owl', "
-        "'futuristic city with flying cars'. Default model: FLUX.1-Krea-dev (high quality). "
         "Parameters: prompt (str), model_id (str, creator/model-name), negative_prompt (str), steps (int, 1–100), "
         "cfg_scale (float, 1–20), sampler (str), seed (int, -1=random), width/height (int, 64–1216). "
         "Returns a PIL.Image. Return the generated media to the user in this format `![Alt text](URL)`"
@@ -1095,11 +1055,7 @@ def Generate_Video(  # <-- MCP tool #6 (Generate Video)
     duration: Annotated[float, "Target duration in seconds (provider/model dependent, commonly 2–6s)."] = 4.0,
 ) -> str:
     """
-    Generate a short video from a text prompt using Hugging Face Inference Providers (Serverless Inference).
-    This tool follows the latest MCP guidance for Gradio-based MCP servers: clear type hints and
-    docstrings define the tool schema automatically. The returned file path will be converted to a file URL
-    for MCP clients.
     Args:
         prompt (str): Text description of the video to generate.
@@ -1228,7 +1184,7 @@ video_generation_interface = gr.Interface(
     "Default model is Wan2.2-T2V-A14B.</div>"
     ),
     api_description=(
-        "Generate a short video from a text prompt using a Hugging Face model (Serverless Inference). "
         "Create dynamic scenes like 'a red fox running through a snowy forest at sunrise', 'waves crashing on a rocky shore', "
         "'time-lapse of clouds moving across a blue sky'. Default model: Wan2.2-T2V-A14B (2-6 second videos). "
         "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "

 def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str:
     # Remove unwanted elements globally first
     for element in full_soup.select("script, style, nav, footer, header, aside"):
         element.decompose()
     max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
 ) -> str:
     """
+    Run a DuckDuckGo search and return numbered results with URLs, titles, and summaries.
     Args:
         query (str): The search query string. Supports operators like site:, quotes for exact matching,
 def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.25,
+    voice: Annotated[str, "Voice identifier from 54 available options."] = "af_heart",
 ) -> Tuple[int, np.ndarray]:
     """
+    Synthesize speech from text using the Kokoro-82M TTS model.
     This function returns raw audio suitable for a Gradio Audio component and is
     also exposed as an MCP tool. It supports 54 different voices across multiple
     languages and accents including American, British, European, Hindi, Italian,
     Japanese, Portuguese, and Chinese speakers.
     Args:
         text (str): The text to synthesize. Works best with English but supports multiple languages.
         speed (float): Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
+        voice (str): Voice identifier from 54 available options. Default: 'af_heart'.
     Returns:
         A tuple of (sample_rate_hz, audio_waveform) where:
         - sample_rate_hz: int sample rate in Hz (24_000)
         - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
     """
     _log_call_start("Generate_Speech", text=_truncate_for_log(text, 200), speed=speed, voice=voice)
     if not text or not text.strip():
     ),
     api_description=(
         "Fetch a web page and return it converted to Markdown format with configurable length. "
+        "Parameters: url (str - absolute URL), verbosity (str - Brief/Standard/Full controlling output length: Brief=1000 chars, Standard=3000 chars, Full=complete page)."
     ),
     flagging_mode="never",
 )
         "<div style=\"text-align:center\">Enhanced web search with readable output format. Always includes snippets for better context and understanding.</div>"
     ),
     api_description=(
+        "Run a DuckDuckGo search and return numbered results with URLs, titles, and summaries. "
         "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
         "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
         "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'."
         "<div style=\"text-align:center\">Generate speech with Kokoro-82M using 54 different voices. Supports multiple languages and accents. Can generate audio of any length! Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
+        "Synthesize speech from text using Kokoro-82M TTS model. Returns (sample_rate, waveform) suitable for playback. "
+        "Supports unlimited text length by processing all segments. Voice examples: 'af_heart' (US female), 'am_onyx' (US male), "
+        "'bf_emma' (British female), 'af_sky' (US female), 'af_nicole' (US female), "
         "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options, default 'af_heart'). "
         "Return the generated media to the user in this format `![Alt text](URL)`"
     ),
     flagging_mode="never",
     height: Annotated[int, "Output height in pixels (64–1216, multiple of 32 recommended)."] = 1024,
 ) -> Image.Image:
     """
+    Generate a single image from a text prompt using a Hugging Face model via serverless inference.
     Args:
         prompt (str): Text description of the image to generate.
         "Default model is FLUX.1-Krea</div>"
     ),
     api_description=(
+        "Generate a single image from a text prompt using a Hugging Face model via serverless inference. "
         "Supports creative prompts like 'a serene mountain landscape at sunset', 'portrait of a wise owl', "
+        "'futuristic city with flying cars'. Default model: FLUX.1-Krea-dev. "
         "Parameters: prompt (str), model_id (str, creator/model-name), negative_prompt (str), steps (int, 1–100), "
         "cfg_scale (float, 1–20), sampler (str), seed (int, -1=random), width/height (int, 64–1216). "
         "Returns a PIL.Image. Return the generated media to the user in this format `![Alt text](URL)`"
     duration: Annotated[float, "Target duration in seconds (provider/model dependent, commonly 2–6s)."] = 4.0,
 ) -> str:
     """
+    Generate a short video from a text prompt using a Hugging Face model via serverless inference.
     Args:
         prompt (str): Text description of the video to generate.
     "Default model is Wan2.2-T2V-A14B.</div>"
     ),
     api_description=(
+        "Generate a short video from a text prompt using a Hugging Face model via serverless inference. "
         "Create dynamic scenes like 'a red fox running through a snowy forest at sunrise', 'waves crashing on a rocky shore', "
         "'time-lapse of clouds moving across a blue sky'. Default model: Wan2.2-T2V-A14B (2-6 second videos). "
         "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "