Spaces:

BridgeAI-Lab
/

Sem-nCG

Running

App Files Files Community

nbansal committed on Jul 20, 2024

Commit

e0e4e28

1 Parent(s): 17b14df

Handled edge cases and added better error messages

Browse files

Files changed (3) hide show

semncg.py +21 -19
tests.py +59 -11
utils.py +55 -34

semncg.py CHANGED Viewed

@@ -308,29 +308,31 @@ def _validate_input_format(
         >>> _validate_input_format(tokenize_sentences, predictions, references, documents)
     """
     if not (len(predictions) == len(references) == len(documents)):
-        raise ValueError("Predictions, References and Documents must have the same length.")
     if len(predictions) == 0:
         raise ValueError("Can't have empty inputs")
-    def is_list_of_strings_at_depth(lst_obj, depth: int):
-        return is_nested_list_of_type(lst_obj, element_type=str, depth=depth)
-    if tokenize_sentences:
-        condition = (
-                is_list_of_strings_at_depth(predictions, 1) and
-                is_list_of_strings_at_depth(references, 1) and
-                is_list_of_strings_at_depth(documents, 1)
-        )
-    else:
-        condition = (
-                is_list_of_strings_at_depth(predictions, 2) and
-                is_list_of_strings_at_depth(references, 2) and
-                is_list_of_strings_at_depth(documents, 2)
-        )
-    if not condition:
-        raise ValueError("Predictions, References and Documents are not valid input format. Refer to documentation.")
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)

         >>> _validate_input_format(tokenize_sentences, predictions, references, documents)
     """
     if not (len(predictions) == len(references) == len(documents)):
+        raise ValueError(
+            f"Predictions, References and Documents must have the same length. "
+            f"Got {len(predictions)} predictions, {len(references)} references and {len(documents)} documents."
+        )
     if len(predictions) == 0:
         raise ValueError("Can't have empty inputs")
+    def check_format(lst_obj, expected_depth: int, name: str):
+        is_valid, error_message = is_nested_list_of_type(lst_obj, element_type=str, depth=expected_depth)
+        if not is_valid:
+            raise ValueError(f"{name} are not in the expected format.\n"
+                             f"Error: {error_message}.")
+    try:
+        if tokenize_sentences:
+            check_format(predictions, expected_depth=1, name="predictions")
+            check_format(references, expected_depth=1, name="references")
+            check_format(documents, expected_depth=1, name="documents")
+        else:
+            check_format(predictions, expected_depth=2, name="predictions")
+            check_format(references, expected_depth=2, name="references")
+            check_format(documents, expected_depth=2, name="documents")
+    except ValueError as ve:
+        raise ValueError(f"Input validation error: {ve}")
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)

tests.py CHANGED Viewed

@@ -139,29 +139,35 @@ class TestUtils(unittest.TestCase):
     def test_is_nested_list_of_type(self):
         # Test case: Depth 0, single element matching element_type
-        self.assertTrue(is_nested_list_of_type("test", str, 0))
         # Test case: Depth 0, single element not matching element_type
-        self.assertFalse(is_nested_list_of_type("test", int, 0))
         # Test case: Depth 1, list of elements matching element_type
-        self.assertTrue(is_nested_list_of_type(["apple", "banana"], str, 1))
         # Test case: Depth 1, list of elements not matching element_type
-        self.assertFalse(is_nested_list_of_type([1, 2, 3], str, 1))
         # Test case: Depth 0 (Wrong), list of elements matching element_type
-        self.assertFalse(is_nested_list_of_type([1, 2, 3], str, 0))
         # Depth 2
-        self.assertTrue(is_nested_list_of_type([[1, 2], [3, 4]], int, 2))
-        self.assertTrue(is_nested_list_of_type([['1', '2'], ['3', '4']], str, 2))
-        self.assertFalse(is_nested_list_of_type([[1, 2], ["a", "b"]], int, 2))
         # Depth 3
-        self.assertFalse(is_nested_list_of_type([[[1], [2]], [[3], [4]]], list, 3))
-        self.assertTrue(is_nested_list_of_type([[[1], [2]], [[3], [4]]], int, 3))
         with self.assertRaises(ValueError):
             is_nested_list_of_type([1, 2], int, -1)
@@ -358,7 +364,7 @@ class TestValidateInputFormat(unittest.TestCase):
             _validate_input_format(tokenize_sentences, predictions, references, documents_invalid)
-class TestSemnCG(unittest.TestCase):
     def setUp(self):
         self.model_name = "stsb-distilbert-base"
         self.metric = SemNCG(self.model_name)
@@ -424,6 +430,48 @@ class TestSemnCG(unittest.TestCase):
         with self.assertRaises(ValueError):
             self.metric.compute(predictions=predictions, references=references, documents=documents)
 if __name__ == '__main__':
     unittest.main(verbosity=2)

     def test_is_nested_list_of_type(self):
         # Test case: Depth 0, single element matching element_type
+        self.assertEqual(is_nested_list_of_type("test", str, 0), (True, ""))
         # Test case: Depth 0, single element not matching element_type
+        is_valid, err_msg = is_nested_list_of_type("test", int, 0)
+        self.assertEqual(is_valid, False)
         # Test case: Depth 1, list of elements matching element_type
+        self.assertEqual(is_nested_list_of_type(["apple", "banana"], str, 1), (True, ""))
         # Test case: Depth 1, list of elements not matching element_type
+        is_valid, err_msg = is_nested_list_of_type([1, 2, 3], str, 1)
+        self.assertEqual(is_valid, False)
         # Test case: Depth 0 (Wrong), list of elements matching element_type
+        is_valid, err_msg = is_nested_list_of_type([1, 2, 3], str, 0)
+        self.assertEqual(is_valid, False)
         # Depth 2
+        self.assertEqual(is_nested_list_of_type([[1, 2], [3, 4]], int, 2), (True, ""))
+        self.assertEqual(is_nested_list_of_type([['1', '2'], ['3', '4']], str, 2), (True, ""))
+        is_valid, err_msg = is_nested_list_of_type([[1, 2], ["a", "b"]], int, 2)
+        self.assertEqual(is_valid, False)
         # Depth 3
+        is_valid, err_msg = is_nested_list_of_type([[[1], [2]], [[3], [4]]], list, 3)
+        self.assertEqual(is_valid, False)
+        self.assertEqual(is_nested_list_of_type([[[1], [2]], [[3], [4]]], int, 3), (True, ""))
+        # Test case: Depth is negative, expecting ValueError
         with self.assertRaises(ValueError):
             is_nested_list_of_type([1, 2], int, -1)
             _validate_input_format(tokenize_sentences, predictions, references, documents_invalid)
+class TestSemNCG(unittest.TestCase):
     def setUp(self):
         self.model_name = "stsb-distilbert-base"
         self.metric = SemNCG(self.model_name)
         with self.assertRaises(ValueError):
             self.metric.compute(predictions=predictions, references=references, documents=documents)
+    def test_bad_inputs(self):
+        def _call_metric(preds, refs, docs, tok):
+            with self.assertRaises(Exception) as ctx:
+                _ = self.metric.compute(
+                    predictions=preds,
+                    references=refs,
+                    documents=docs,
+                    tokenize_sentences=tok,
+                    pre_compute_embeddings=True,
+                )
+            print(f"Raised Exception with message: {ctx.exception}")
+            return ""
+        # None Inputs
+        # Case I
+        tokenize_sentences = True
+        predictions = [None]
+        references = ["A cat was sitting on a mat."]
+        documents = ["There was a cat on a mat."]
+        print(f"Case I\n{_call_metric(predictions, references, documents, tokenize_sentences)}\n")
+        # Case II
+        tokenize_sentences = False
+        predictions = [["A cat was sitting on a mat.", None]]
+        references = [["A cat was sitting on a mat.", "A cat was sitting on a mat."]]
+        documents = [["There was a cat on a mat.", "There was a cat on a mat."]]
+        print(f"Case II\n{_call_metric(predictions, references, documents, tokenize_sentences)}\n")
+        # Empty Input
+        tokenize_sentences = True
+        predictions = []
+        references = ["A cat was sitting on a mat."]
+        documents = ["There was a cat on a mat."]
+        print(f"Case: Empty Input\n{_call_metric(predictions, references, documents, tokenize_sentences)}\n")
+        # Empty String Input
+        tokenize_sentences = True
+        predictions = [""]
+        references = ["A cat was sitting on a mat."]
+        documents = ["There was a cat on a mat."]
+        print(f"Case: Empty String Input\n{_call_metric(predictions, references, documents, tokenize_sentences)}\n")
 if __name__ == '__main__':
     unittest.main(verbosity=2)

utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import string
-from typing import List, Union
 import nltk
 import torch
@@ -167,45 +167,66 @@ def flatten_list(nested_list: list) -> list:
     return flat_list
-def is_nested_list_of_type(lst_obj, element_type, depth: int) -> bool:
     """
-        Check if the given object is a nested list of a specific type up to a specified depth.
-        Args:
-        - lst_obj: The object to check, expected to be a list or a single element.
-        - element_type: The type that each element in the nested list should match.
-        - depth (int): The depth of nesting to check. Must be non-negative.
-        Returns:
-        - bool: True if lst_obj is a nested list of the specified type up to the given depth, False otherwise.
-        Raises:
-        - ValueError: If depth is negative.
-        Example:
-        ```python
-        # Test cases
-        is_nested_list_of_type("test", str, 0)  # Returns True
-        is_nested_list_of_type([1, 2, 3], str, 0)  # Returns False
-        is_nested_list_of_type(["apple", "banana"], str, 1)  # Returns True
-        is_nested_list_of_type([[1, 2], [3, 4]], int, 2)  # Returns True
-        is_nested_list_of_type([[1, 2], ["a", "b"]], int, 2)  # Returns False
-        is_nested_list_of_type([[[1], [2]], [[3], [4]]], int, 3)  # Returns True
-        ```
-        Explanation:
-        - The function checks if `lst_obj` is a nested list of elements of type `element_type` up to `depth` levels deep.
-        - If `depth` is 0, it checks if `lst_obj` itself is of type `element_type`.
-        - If `depth` is greater than 0, it recursively checks each level of nesting to ensure all elements match `element_type`.
-        - Raises a `ValueError` if `depth` is negative, as depth must be a non-negative integer.
     """
-    if depth == 0:
-        return isinstance(lst_obj, element_type)
-    elif depth > 0:
-        return isinstance(lst_obj, list) and all(
-            is_nested_list_of_type(item, element_type, depth - 1) for item in lst_obj)
-    else:
-        raise ValueError("Depth can't be negative")
 def slice_embeddings(embeddings: NDArray, num_sentences: NumSentencesType) -> EmbeddingSlicesType:

 import string
+from typing import List, Union, Tuple
 import nltk
 import torch
     return flat_list
+def is_nested_list_of_type(lst_obj, element_type, depth: int) -> Tuple[bool, str]:
     """
+    Check if the given object is a nested list of a specific type up to a specified depth.
+    Args:
+    - lst_obj: The object to check, expected to be a list or a single element.
+    - element_type: The type that each element in the nested list should match.
+    - depth (int): The depth of nesting to check. Must be non-negative.
+    Returns:
+    - Tuple[bool, str]: A tuple containing:
+        - A boolean indicating if lst_obj is a nested list of the specified type up to the given depth.
+        - A string containing an error message if the check fails, or an empty string if the check passes.
+    Raises:
+    - ValueError: If depth is negative.
+    Example:
+    ```python
+    # Test cases
+    is_nested_list_of_type("test", str, 0)  # Returns (True, "")
+    is_nested_list_of_type([1, 2, 3], str, 0)  # Returns (False, "Element is of type list, expected type str.")
+    is_nested_list_of_type(["apple", "banana"], str, 1)  # Returns (True, "")
+    is_nested_list_of_type([[1, 2], [3, 4]], int, 2)  # Returns (True, "")
+    is_nested_list_of_type([[1, 2], ["a", "b"]], int, 2)  # Returns (False, msg), where msg starts with "Element at index 1 has incorrect type." and is followed by the offending element and the nested type error
+    is_nested_list_of_type([[[1], [2]], [[3], [4]]], int, 3)  # Returns (True, "")
+    ```
+    Explanation:
+    - The function checks if `lst_obj` is a nested list of elements of type `element_type` up to `depth` levels deep.
+    - If `depth` is 0, it checks if `lst_obj` itself is of type `element_type`.
+    - If `depth` is greater than 0, it recursively checks each level of nesting to ensure all elements match
+    `element_type`.
+    - Returns a tuple containing a boolean and an error message. The boolean is `True` if `lst_obj` matches the
+    criteria, `False` otherwise. The error message provides details if the check fails.
+    - Raises a `ValueError` if `depth` is negative, as depth must be a non-negative integer.
     """
+    orig_depth = depth
+    def _is_nested_list_of_type(lst_o, e_type, d) -> Tuple[bool, str]:
+        if d == 0:
+            if isinstance(lst_o, e_type):
+                return True, ""
+            else:
+                return False, f"Element is of type {type(lst_o).__name__}, expected type {e_type.__name__}."
+        elif d > 0:
+            if isinstance(lst_o, list):
+                for i, item in enumerate(lst_o):
+                    is_valid, err = _is_nested_list_of_type(item, e_type, d - 1)
+                    if not is_valid:
+                        msg = (f"Element at index {i} has incorrect type.\nGiven Element at index {i}: {lst_o[i]}"
+                               f"\n{err}") if d == orig_depth else err
+                        return False, msg
+                return True, ""
+            else:
+                return False, f"Object is not a list but {type(lst_o)}."
+        else:
+            raise ValueError("Depth can't be negative")
+    return _is_nested_list_of_type(lst_obj, element_type, depth)
 def slice_embeddings(embeddings: NDArray, num_sentences: NumSentencesType) -> EmbeddingSlicesType: