Spaces: Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,6 +11,19 @@ from analyze import analyzer, get_column_description, get_columns_with_strings,
|
|
| 11 |
|
| 12 |
MAX_ROWS = 100
|
| 13 |
T = TypeVar("T")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
| 16 |
batch_size = 100
|
|
@@ -52,7 +65,7 @@ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFram
|
|
| 52 |
for presidio_entity in presidio_scan_entities(
|
| 53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
| 54 |
):
|
| 55 |
-
if presidio_entity
|
| 56 |
presidio_entities.append(presidio_entity)
|
| 57 |
yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
| 58 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
|
@@ -68,8 +81,8 @@ with gr.Blocks() as demo:
|
|
| 68 |
),
|
| 69 |
gr.CheckBoxGroup(
|
| 70 |
label="Presidio entities",
|
| 71 |
-
choices=analyzer.get_supported_entities(),
|
| 72 |
-
value=
|
| 73 |
interative=True,
|
| 74 |
),
|
| 75 |
]
|
|
@@ -80,7 +93,14 @@ with gr.Blocks() as demo:
|
|
| 80 |
]
|
| 81 |
button.click(analyze_dataset, inputs, outputs)
|
| 82 |
gr.Examples(
|
| 83 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
inputs,
|
| 85 |
outputs,
|
| 86 |
fn=analyze_dataset,
|
|
|
|
| 11 |
|
| 12 |
MAX_ROWS = 100
|
| 13 |
T = TypeVar("T")
|
| 14 |
+
DEFAULT_PRESIDIO_ENTITIES = sorted([
|
| 15 |
+
'PERSON',
|
| 16 |
+
'CREDIT_CARD',
|
| 17 |
+
'US_SSN',
|
| 18 |
+
'US_DRIVER_LICENSE',
|
| 19 |
+
'PHONE_NUMBER',
|
| 20 |
+
'US_PASSPORT',
|
| 21 |
+
'EMAIL_ADDRESS',
|
| 22 |
+
'IP_ADDRESS',
|
| 23 |
+
'US_BANK_NUMBER',
|
| 24 |
+
'IBAN_CODE',
|
| 25 |
+
'EMAIL',
|
| 26 |
+
])
|
| 27 |
|
| 28 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
| 29 |
batch_size = 100
|
|
|
|
| 65 |
for presidio_entity in presidio_scan_entities(
|
| 66 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
| 67 |
):
|
| 68 |
+
if presidio_entity["type"] in enabled_presidio_entities:
|
| 69 |
presidio_entities.append(presidio_entity)
|
| 70 |
yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
| 71 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
|
|
|
| 81 |
),
|
| 82 |
gr.CheckBoxGroup(
|
| 83 |
label="Presidio entities",
|
| 84 |
+
choices=sorted(analyzer.get_supported_entities()),
|
| 85 |
+
value=DEFAULT_PRESIDIO_ENTITIES,
|
| 86 |
interative=True,
|
| 87 |
),
|
| 88 |
]
|
|
|
|
| 93 |
]
|
| 94 |
button.click(analyze_dataset, inputs, outputs)
|
| 95 |
gr.Examples(
|
| 96 |
+
[
|
| 97 |
+
["microsoft/orca-math-word-problems-200k", DEFAULT_PRESIDIO_ENTITIES],
|
| 98 |
+
["tatsu-lab/alpaca", DEFAULT_PRESIDIO_ENTITIES],
|
| 99 |
+
["Anthropic/hh-rlhf", DEFAULT_PRESIDIO_ENTITIES],
|
| 100 |
+
["OpenAssistant/oasst1", DEFAULT_PRESIDIO_ENTITIES],
|
| 101 |
+
["sidhq/email-thread-summary", DEFAULT_PRESIDIO_ENTITIES],
|
| 102 |
+
["lhoestq/fake_name_and_ssn", DEFAULT_PRESIDIO_ENTITIES]
|
| 103 |
+
],
|
| 104 |
inputs,
|
| 105 |
outputs,
|
| 106 |
fn=analyze_dataset,
|