Spaces:
Build error
Build error
| """Streamlit app for Presidio.""" | |
import functools
import json
from json import JSONEncoder

import pandas as pd
import spacy
import streamlit as st
from annotated_text import annotated_text
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine

from flair_recognizer import FlairRecognizer
# This script's top-level code runs on every execution, so only fetch the
# spaCy model when it is not already installed instead of invoking the
# downloader each time.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")
| # Helper methods | |
@functools.lru_cache(maxsize=1)
def analyzer_engine():
    """Return the AnalyzerEngine used by this app, building it on first use.

    The engine is backed by a RecognizerRegistry that holds a
    FlairRecognizer plus Presidio's predefined recognizers.

    The result is memoized: this function is called several times in one
    script execution (entity listing, analysis, start-up), and rebuilding
    the registry and engine on each call repeats the same expensive setup.
    Every caller therefore receives the same engine instance.

    NOTE(review): the memo lives on this function object; if the hosting
    framework redefines the function on each rerun, the cache is rebuilt
    per rerun -- confirm whether a framework-level cache is also wanted.
    """
    registry = RecognizerRegistry()
    registry.add_recognizer(FlairRecognizer())
    registry.load_predefined_recognizers()
    return AnalyzerEngine(registry=registry)
def anonymizer_engine():
    """Build and return a new AnonymizerEngine (constructed with no arguments)."""
    engine = AnonymizerEngine()
    return engine
def get_supported_entities():
    """Return the entity types reported as supported by the analyzer engine."""
    engine = analyzer_engine()
    return engine.get_supported_entities()
def analyze(**kwargs):
    """Analyze input using the Analyzer engine and the given keyword arguments.

    All keyword arguments are forwarded to the engine's ``analyze`` method.
    The ``entities`` argument is normalized first: when it is missing,
    ``None``, or contains the marker ``"All"``, it is passed on as ``None``
    so that no entity filter is applied.

    Returns whatever the engine's ``analyze`` call returns.
    """
    requested = kwargs.get("entities")
    # An explicit entities=None must not reach the membership test below,
    # which would raise TypeError on None.
    if requested is None or "All" in requested:
        kwargs["entities"] = None
    return analyzer_engine().analyze(**kwargs)
def anonymize(text, analyze_results):
    """Anonymize the identified parts of the input using Presidio Anonymizer.

    ``text`` and ``analyze_results`` are handed to the anonymizer engine
    as-is; the ``text`` attribute of its result is returned.
    """
    engine = anonymizer_engine()
    anonymized = engine.anonymize(text, analyze_results)
    return anonymized.text
def annotate(text, st_analyze_results, st_entities):
    """Split ``text`` into plain and annotated tokens for display.

    Args:
        text: The analyzed text.
        st_analyze_results: Iterable of findings, each exposing ``start``,
            ``end`` and ``entity_type`` attributes.
        st_entities: Unused; kept so existing callers keep working.

    Returns:
        A list alternating plain strings and ``(entity_text, entity_type)``
        tuples, in text order. The plain strings may be empty. When there
        are no findings the list contains just the whole text, so the
        input is still shown.

    Findings that start inside an already emitted finding are skipped so
    that no part of the text is rendered twice; among findings with the
    same start, the longest one is kept.
    """
    tokens = []
    cursor = 0  # index of the first character not yet emitted
    # Order by start; for equal starts put the longest span first.
    ordered = sorted(st_analyze_results, key=lambda res: (res.start, -res.end))
    for res in ordered:
        if res.start < cursor:
            # Overlaps a span that was already emitted.
            continue
        # Plain text between the previous entity (or text start) and this one.
        tokens.append(text[cursor:res.start])
        # Entity text together with its entity type.
        tokens.append((text[res.start:res.end], res.entity_type))
        cursor = res.end
    # Whatever remains after the last entity (the whole text if none).
    tokens.append(text[cursor:])
    return tokens
st.set_page_config(page_title="Presidio demo (English)", layout="wide")

# Side bar
st.sidebar.markdown(
    """
Anonymize PII entities in text with [presidio](https://aka.ms/presidio), spaCy and a [PII detection model](https://huggingface.co/beki/flair-pii-english) trained on protocol trace data generated by [privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy).
"""
)

# Ask the analyzer for its supported entities once and reuse the answer for
# both the selectable options and the default selection.
supported_entities = list(get_supported_entities())
st_entities = st.sidebar.multiselect(
    label="Which entities to look for?",
    options=supported_entities,
    default=list(supported_entities),
)

st_threshold = st.sidebar.slider(
    label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
)

st_return_decision_process = st.sidebar.checkbox("Add analysis explanations in json")

# The two literals below are concatenated by Python, so the first one must
# end with its own sentence break; otherwise the text runs together.
st.sidebar.info(
    "Presidio is an open source framework for PII detection and anonymization. Privy is an open source framework for synthetic data generation in protocol trace formats (json, sql, html etc). "
    "For more info visit [aka.ms/presidio](https://aka.ms/presidio) and [privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy)"
)
# Main panel
# Show a temporary notice while the analyzer engine is being created and
# clear it as soon as the engine is available.
analyzer_load_state = st.info("Starting Presidio analyzer and loading Privy-trained model...")
engine = analyzer_engine()
analyzer_load_state.empty()

# Initial content of the input box.
default_text = (
    "like a phone number (212-141-4544) "
    "or a name (Lebron James)."
)
st_text = st.text_area(label="Type in some text", value=default_text, height=200)
# Analysis output: run the analyzer on the current text with the sidebar
# settings, then render the text with the findings annotated.
st.subheader("Analyzed")
with st.spinner("Analyzing..."):
    st_analyze_results = analyze(
        text=st_text,
        entities=st_entities,
        language="en",
        score_threshold=st_threshold,
        return_decision_process=st_return_decision_process,
    )
    # Turn the findings into the plain-string / (text, entity type) tokens
    # that annotated_text takes as positional arguments.
    annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
    annotated_text(*annotated_tokens)
# vertical space
st.text("")

# Anonymized output: the same text with the findings replaced by the
# anonymizer.
st.subheader("Anonymized")
with st.spinner("Anonymizing..."):
    st_anonymize_results = anonymize(st_text, st_analyze_results)
    # Bare expression: presumably rendered through Streamlit's "magic"
    # display of standalone values -- confirm magic is enabled for this app.
    st_anonymize_results
# table result
st.subheader("Detailed Findings")
if not st_analyze_results:
    st.text("No findings")
else:
    findings = []
    for result in st_analyze_results:
        record = result.to_dict()
        # The dict returned by to_dict() is updated in place; if it is a
        # live view of the result object, the extra key stays visible to
        # later consumers of the results -- verify before changing this.
        record['Value'] = st_text[record['start']:record['end']]
        findings.append(record)

    # Keep only the columns shown to the user and give them display names.
    shown_columns = ["entity_type", "Value", "score", "start", "end"]
    display_names = {
        "entity_type": "Entity type",
        "start": "Start",
        "end": "End",
        "score": "Confidence",
    }
    df = pd.DataFrame.from_records(findings)
    df = df[shown_columns].rename(columns=display_names)
    st.dataframe(df, width=1000)
| # json result | |
class ToDictListEncoder(JSONEncoder):
    """JSON encoder for objects that expose a ``to_dict()`` method."""

    def default(self, o):
        """Encode ``o`` to a JSON-serializable value.

        Called by the encoder for objects it cannot serialize itself.

        Returns:
            ``[]`` for a falsy object, otherwise the result of
            ``o.to_dict()``.

        Raises:
            TypeError: if ``o`` is truthy and has no callable ``to_dict``
                (raised by the base class, as ``JSONEncoder.default``
                implementations are expected to do).
        """
        if not o:
            return []
        to_dict = getattr(o, "to_dict", None)
        if callable(to_dict):
            return to_dict()
        # Defer to the base implementation so unsupported objects fail
        # with the standard "not JSON serializable" TypeError.
        return super().default(o)
# Only shown when the sidebar checkbox for analysis explanations is ticked.
if st_return_decision_process:
    findings_json = json.dumps(st_analyze_results, cls=ToDictListEncoder)
    st.json(findings_json)