Spaces:
Running
Running
Update curated.py
Browse files- curated.py +32 -0
curated.py
CHANGED
|
@@ -919,6 +919,30 @@ def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo")
|
|
| 919 |
)
|
| 920 |
|
| 921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
def update(target: str, request):
|
| 923 |
params = request.query_params
|
| 924 |
if data_source := params.get(f"data_source_{target}"):
|
|
@@ -1043,6 +1067,13 @@ def curated(request):
|
|
| 1043 |
|
| 1044 |
|
| 1045 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1046 |
preprocessing_steps = pd.DataFrame(
|
| 1047 |
{
|
| 1048 |
"Step": [
|
|
@@ -1127,6 +1158,7 @@ def curated(request):
|
|
| 1127 |
plotly2fasthtml(diff2_stacked_bar),
|
| 1128 |
P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
|
| 1129 |
filtering_process,
|
|
|
|
| 1130 |
data_preparation_div,
|
| 1131 |
#H2("Local Deduplication"), are these numbers even right?
|
| 1132 |
#local_dedup_text,
|
|
|
|
| 919 |
)
|
| 920 |
|
| 921 |
|
| 922 |
+
|
| 923 |
+
def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
    """Build the raw-vs-extracted sample view for one curated sample document.

    Args:
        data_source: Name of the data source. Only ``"Freelaw"`` loads the
            sample files from disk; any other value falls back to a list of
            ten empty placeholder documents.
        doc_id: Index of the sample document. It is coerced with ``int()``
            and clamped to the range 0..9 before use.
        target: Identifier forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.

    Raises:
        ValueError / TypeError: if ``doc_id`` cannot be converted to ``int``.
        OSError: if a Freelaw sample file cannot be opened.
        IndexError: if a sample file holds fewer entries than ``doc_id + 1``.
    """
    # Clamp so out-of-range requests still map to a valid sample slot.
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Freelaw":
        # Context managers guarantee the handles are closed even if parsing
        # fails. Binary mode lets ``json`` detect the UTF-8/16/32 encoding
        # itself instead of relying on the platform's locale default.
        with open("data/curated_samples/freelaw_raw.json", "rb") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/freelaw_extract.json", "rb") as extracted_file:
            extracted_sample_doc = json.load(extracted_file)
    else:
        # Unknown source: ten empty placeholders so every clamped doc_id
        # still indexes successfully.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    # ``view_data`` and ``data_sources`` are resolved from the enclosing
    # module scope; they are not defined in this block.
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        data_source=data_source,
        data_sources=data_sources,
        target=target,
    )
|
| 944 |
+
|
| 945 |
+
|
| 946 |
def update(target: str, request):
|
| 947 |
params = request.query_params
|
| 948 |
if data_source := params.get(f"data_source_{target}"):
|
|
|
|
| 1067 |
|
| 1068 |
|
| 1069 |
|
| 1070 |
+
freelaw_examples = Div(
|
| 1071 |
+
Div(
|
| 1072 |
+
get_freelaw_data(target=gen_random_id()),
|
| 1073 |
+
style="border: 1px solid #ccc; padding: 20px;",
|
| 1074 |
+
),
|
| 1075 |
+
)
|
| 1076 |
+
|
| 1077 |
preprocessing_steps = pd.DataFrame(
|
| 1078 |
{
|
| 1079 |
"Step": [
|
|
|
|
| 1158 |
plotly2fasthtml(diff2_stacked_bar),
|
| 1159 |
P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
|
| 1160 |
filtering_process,
|
| 1161 |
+
freelaw_examples,
|
| 1162 |
data_preparation_div,
|
| 1163 |
#H2("Local Deduplication"), are these numbers even right?
|
| 1164 |
#local_dedup_text,
|